opener-property-tagger 3.3.6 → 3.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 2d223b859da75d45b68da9def103878187f20a8f95c780eaf73556d5b4d3eb5e
         | 
| 4 | 
            +
              data.tar.gz: 021f008d00c3cbcf640f694703ca4308f77d080e0ab85ff72c89570a9f6987e7
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: fce119a3e41bc3647816c45fb328d3c626ebc7d8bbfe2d2372931d612da3aa7db1e8cd6591c02c6d512532736e5b820ae707fa7616114028165e37ceb073e495
         | 
| 7 | 
            +
              data.tar.gz: 307786c8aa77e7785486d1e8eebc506dd7096861702855d2f2f0d2fe12b216be3764900de415e492f9d236536b41204e6cfe566cae8a92fa0b71b39363cf41f6
         | 
| @@ -33,12 +33,17 @@ module Opener | |
| 33 33 | 
             
                  # @param [String] path
         | 
| 34 34 | 
             
                  #
         | 
| 35 35 | 
             
                  def load_aspects(path)
         | 
| 36 | 
            -
                    mapping = Hash.new | 
| 36 | 
            +
                    mapping = Hash.new{ |hash, key| hash[key] = [] }
         | 
| 37 37 |  | 
| 38 | 
            -
                    File.foreach | 
| 39 | 
            -
                      lemma,  | 
| 38 | 
            +
                    File.foreach path do |line|
         | 
| 39 | 
            +
                      lemma, pos, aspect = line.chomp.split("\t")
         | 
| 40 | 
            +
                      l = Hashie::Mash.new(
         | 
| 41 | 
            +
                        lemma:  lemma,
         | 
| 42 | 
            +
                        pos:    pos,
         | 
| 43 | 
            +
                        aspect: aspect,
         | 
| 44 | 
            +
                      )
         | 
| 40 45 |  | 
| 41 | 
            -
                      mapping[lemma.to_sym] <<  | 
| 46 | 
            +
                      mapping[l.lemma.to_sym] << l
         | 
| 42 47 | 
             
                    end
         | 
| 43 48 |  | 
| 44 49 | 
             
                    return mapping
         | 
| @@ -6,7 +6,8 @@ module Opener | |
| 6 6 | 
             
                class Processor
         | 
| 7 7 |  | 
| 8 8 | 
             
                  attr_accessor :document
         | 
| 9 | 
            -
                  attr_accessor : | 
| 9 | 
            +
                  attr_accessor :aspects_path, :aspects_url
         | 
| 10 | 
            +
                  attr_accessor :aspects, :lexicons
         | 
| 10 11 | 
             
                  attr_accessor :timestamp, :pretty
         | 
| 11 12 |  | 
| 12 13 | 
             
                  ##
         | 
| @@ -34,10 +35,10 @@ module Opener | |
| 34 35 | 
             
                    @remote       = !url.nil?
         | 
| 35 36 | 
             
                    @aspects_path = path
         | 
| 36 37 | 
             
                    @aspects_url  = url
         | 
| 37 | 
            -
                    @cache_keys   = params[:cache_keys]
         | 
| 38 | 
            +
                    @cache_keys   = params[:cache_keys] || {}
         | 
| 38 39 | 
             
                    @cache_keys.merge! lang: @document.root.attr('xml:lang')
         | 
| 39 40 |  | 
| 40 | 
            -
                    @ | 
| 41 | 
            +
                    @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
         | 
| 41 42 | 
             
                  end
         | 
| 42 43 |  | 
| 43 44 | 
             
                  ##
         | 
| @@ -45,15 +46,13 @@ module Opener | |
| 45 46 | 
             
                  # @return [String]
         | 
| 46 47 | 
             
                  #
         | 
| 47 48 | 
             
                  def process
         | 
| 48 | 
            -
                    existing_aspects = extract_aspects
         | 
| 49 | 
            -
             | 
| 50 49 | 
             
                    add_features_layer
         | 
| 51 50 | 
             
                    add_properties_layer
         | 
| 52 51 |  | 
| 53 | 
            -
                     | 
| 52 | 
            +
                    extract_aspects.each.with_index do |(lemma, values), index|
         | 
| 54 53 | 
             
                      index += 1
         | 
| 55 54 |  | 
| 56 | 
            -
                      add_property | 
| 55 | 
            +
                      add_property lemma, values, index
         | 
| 57 56 | 
             
                    end
         | 
| 58 57 |  | 
| 59 58 | 
             
                    add_linguistic_processor
         | 
| @@ -77,37 +76,41 @@ module Opener | |
| 77 76 | 
             
                    @terms
         | 
| 78 77 | 
             
                  end
         | 
| 79 78 |  | 
| 79 | 
            +
                  # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
         | 
| 80 | 
            +
                  # lemmas) belong to a property.
         | 
| 81 | 
            +
                  MAX_NGRAM = 2
         | 
| 82 | 
            +
             | 
| 80 83 | 
             
                  ##
         | 
| 81 84 | 
             
                  # Check which terms belong to an aspect (property)
         | 
| 82 85 | 
             
                  # Text have priority over Lemmas, overriding if there is a conflict
         | 
| 83 86 | 
             
                  # @return [Hash]
         | 
| 84 87 | 
             
                  #
         | 
| 85 88 | 
             
                  def extract_aspects
         | 
| 86 | 
            -
                     | 
| 89 | 
            +
                    all_term_ids = terms.keys
         | 
| 87 90 | 
             
                    lemmas       = terms.values
         | 
| 88 | 
            -
                    uniq_aspects = Hash.new | 
| 91 | 
            +
                    uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
         | 
| 89 92 |  | 
| 90 93 | 
             
                    [:lemma, :text].each do |k|
         | 
| 91 94 | 
             
                      current_token = 0
         | 
| 92 | 
            -
                      # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
         | 
| 93 | 
            -
                      # lemmas) belong to a property.
         | 
| 94 | 
            -
                      max_ngram = 2
         | 
| 95 | 
            -
             | 
| 96 95 |  | 
| 97 96 | 
             
                      while current_token < terms.count
         | 
| 98 | 
            -
                        (0.. | 
| 99 | 
            -
                           | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
                               | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 109 | 
            -
             | 
| 110 | 
            -
                               | 
| 97 | 
            +
                        (0..MAX_NGRAM).each do |tam_ngram|
         | 
| 98 | 
            +
                          next unless current_token + tam_ngram <= terms.count
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                          ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                          @lexicons[ngram.to_sym]&.each do |l|
         | 
| 103 | 
            +
                            properties = if l.aspects.present? then l.aspects else [l.aspect] end
         | 
| 104 | 
            +
                            properties.each do |p|
         | 
| 105 | 
            +
                              next if p.blank?
         | 
| 106 | 
            +
                              term_ids = all_term_ids[current_token..current_token+tam_ngram]
         | 
| 107 | 
            +
                              next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                              uniq_aspects[p.to_sym] << Hashie::Mash.new(
         | 
| 110 | 
            +
                                term_ids: term_ids,
         | 
| 111 | 
            +
                                ngram:    ngram,
         | 
| 112 | 
            +
                                lexicon:  l,
         | 
| 113 | 
            +
                              )
         | 
| 111 114 | 
             
                            end
         | 
| 112 115 | 
             
                          end
         | 
| 113 116 | 
             
                        end
         | 
| @@ -135,24 +138,25 @@ module Opener | |
| 135 138 | 
             
                    new_node("properties", "KAF/features")
         | 
| 136 139 | 
             
                  end
         | 
| 137 140 |  | 
| 138 | 
            -
                  def add_property | 
| 141 | 
            +
                  def add_property lemma, values, index
         | 
| 139 142 | 
             
                    property_node = new_node("property", "KAF/features/properties")
         | 
| 140 143 |  | 
| 141 | 
            -
                    property_node['lemma'] =  | 
| 144 | 
            +
                    property_node['lemma'] = lemma.to_s
         | 
| 142 145 | 
             
                    property_node['pid']   = "p#{index.to_s}"
         | 
| 143 146 |  | 
| 144 147 | 
             
                    references_node = new_node("references", property_node)
         | 
| 145 148 |  | 
| 146 | 
            -
                     | 
| 147 | 
            -
                      comm_node = Nokogiri::XML::Comment.new(references_node, " #{v. | 
| 149 | 
            +
                    values.each do |v|
         | 
| 150 | 
            +
                      comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
         | 
| 148 151 | 
             
                      references_node.add_child comm_node
         | 
| 149 152 |  | 
| 150 | 
            -
                      span_node = new_node | 
| 153 | 
            +
                      span_node = new_node 'span', references_node
         | 
| 151 154 |  | 
| 152 | 
            -
                      v. | 
| 153 | 
            -
                        target_node       = new_node | 
| 155 | 
            +
                      v.term_ids.each do |id|
         | 
| 156 | 
            +
                        target_node       = new_node 'target', span_node
         | 
| 154 157 |  | 
| 155 | 
            -
                        target_node['id'] =  | 
| 158 | 
            +
                        target_node['id'] = id.to_s
         | 
| 159 | 
            +
                        target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
         | 
| 156 160 | 
             
                      end
         | 
| 157 161 | 
             
                    end
         | 
| 158 162 | 
             
                  end
         | 
| @@ -17,10 +17,11 @@ module Opener | |
| 17 17 | 
             
                  end
         | 
| 18 18 |  | 
| 19 19 | 
             
                  def [] **params
         | 
| 20 | 
            +
                    existing = @cache[params]
         | 
| 21 | 
            +
                    return existing if existing and existing.from > UPDATE_INTERVAL.ago
         | 
| 22 | 
            +
             | 
| 20 23 | 
             
                    synchronize do
         | 
| 21 | 
            -
                       | 
| 22 | 
            -
                      break existing if existing and existing.from > UPDATE_INTERVAL.ago
         | 
| 23 | 
            -
                      @cache[params] = cache_update existing, **params
         | 
| 24 | 
            +
                      @cache[params] = cache_update @cache[params], **params
         | 
| 24 25 | 
             
                    end
         | 
| 25 26 | 
             
                  end
         | 
| 26 27 | 
             
                  alias_method :get, :[]
         | 
| @@ -49,7 +50,10 @@ module Opener | |
| 49 50 | 
             
                    lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
         | 
| 50 51 | 
             
                    mapping  = Hash.new{ |hash, key| hash[key] = [] }
         | 
| 51 52 | 
             
                    lexicons.each do |l|
         | 
| 52 | 
            -
                      mapping[l.lemma.to_sym] << l | 
| 53 | 
            +
                      mapping[l.lemma.to_sym] << l
         | 
| 54 | 
            +
                      l.variants&.each do |v|
         | 
| 55 | 
            +
                        mapping[v.lemma.to_sym] << l
         | 
| 56 | 
            +
                      end
         | 
| 53 57 | 
             
                    end
         | 
| 54 58 |  | 
| 55 59 | 
             
                    mapping
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: opener-property-tagger
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 3.3 | 
| 4 | 
            +
              version: 3.4.3
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - development@olery.com
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2021-09-03 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: opener-daemons
         | 
| @@ -167,9 +167,9 @@ dependencies: | |
| 167 167 | 
             
            description: Property tagger for hotels in Dutch and English.
         | 
| 168 168 | 
             
            email:
         | 
| 169 169 | 
             
            executables:
         | 
| 170 | 
            -
            - property-tagger
         | 
| 171 170 | 
             
            - property-tagger-daemon
         | 
| 172 171 | 
             
            - property-tagger-server
         | 
| 172 | 
            +
            - property-tagger
         | 
| 173 173 | 
             
            extensions: []
         | 
| 174 174 | 
             
            extra_rdoc_files: []
         | 
| 175 175 | 
             
            files:
         | 
| @@ -212,8 +212,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 212 212 | 
             
                - !ruby/object:Gem::Version
         | 
| 213 213 | 
             
                  version: '0'
         | 
| 214 214 | 
             
            requirements: []
         | 
| 215 | 
            -
             | 
| 216 | 
            -
            rubygems_version: 2.7.8
         | 
| 215 | 
            +
            rubygems_version: 3.2.14
         | 
| 217 216 | 
             
            signing_key:
         | 
| 218 217 | 
             
            specification_version: 4
         | 
| 219 218 | 
             
            summary: Property tagger for hotels in Dutch and English.
         |