opener-property-tagger 3.3.5 → 3.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4498897f273195326764dae62875d3901919f69014df57d0919d88b0755ee777
4
- data.tar.gz: e0c3c1cfcc32cf260024c2c4b5fee4afb7e6a2b4e46ba690499e8b151e959e52
3
+ metadata.gz: 181a83398a21579d625df2d6dfbb5e8d03759e2e285c40f3e6fe519f73f270b7
4
+ data.tar.gz: f2548258178f538a8f7e32e0617fd5f8bea8399373399529fab6316906739543
5
5
  SHA512:
6
- metadata.gz: 0f0557b5cb1604b6edec19c6d5c9b2d3cc892792d7562209d619cfe0f1f09c04fa0a340f3197e8f5963d0dbefbfb890d18e9051f82705eaa346f20682b724eb1
7
- data.tar.gz: 828fb63990b968d4511548b5ed6b05a44574cc9b0ed933745ddb7053cc642400ddf1ff7966f50a55b6dc84223a1fbbb8882c9cb28850dd3a8e98e550e9453183
6
+ metadata.gz: e3817f4e33c9aedc67d767e4e194a76a7b3d5f87331e7722916dd3f84c4475f51f04763a9dfc7561092eab623aeee2f90c2ab7b4b2a8caf2b99e34708ebe3995
7
+ data.tar.gz: bb42edeff3f5225972d475c4522da2b91b99484edb94d2a14b646ea7712964e0f22858819e7b0affb4b59ecfd87f4823035ab13d804a76c4b377684bfb7ea7c1
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
39
- lemma, _pos, aspect = line.chomp.split("\t")
38
+ File.foreach path do |line|
39
+ lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -31,12 +32,13 @@ module Opener
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,15 +46,13 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
@@ -76,36 +76,41 @@ module Opener
76
76
  @terms
77
77
  end
78
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belongs to a property.
81
+ MAX_NGRAM = 2
82
+
79
83
  ##
80
84
  # Check which terms belong to an aspect (property)
85
+ # Text has priority over Lemmas, overriding if there is a conflict
81
86
  # @return [Hash]
82
87
  #
83
88
  def extract_aspects
84
- term_ids = terms.keys
89
+ all_term_ids = terms.keys
85
90
  lemmas = terms.values
86
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
87
92
 
88
93
  [:lemma, :text].each do |k|
89
94
  current_token = 0
90
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
91
- # lemmas) belong to a property.
92
- max_ngram = 2
93
-
94
95
 
95
96
  while current_token < terms.count
96
- (0..max_ngram).each do |tam_ngram|
97
- if current_token + tam_ngram <= terms.count
98
- ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
99
-
100
- if aspects[ngram.to_sym]
101
- properties = aspects[ngram.to_sym]
102
- ids = term_ids[current_token..current_token+tam_ngram]
103
-
104
- properties.uniq.each do |property|
105
- next if !property or property.strip.empty?
106
-
107
- uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
108
- end
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
109
114
  end
110
115
  end
111
116
  end
@@ -133,24 +138,25 @@ module Opener
133
138
  new_node("properties", "KAF/features")
134
139
  end
135
140
 
136
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
137
142
  property_node = new_node("property", "KAF/features/properties")
138
143
 
139
- property_node['lemma'] = key.to_s
144
+ property_node['lemma'] = lemma.to_s
140
145
  property_node['pid'] = "p#{index.to_s}"
141
146
 
142
147
  references_node = new_node("references", property_node)
143
148
 
144
- value.uniq.each do |v|
145
- comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
146
151
  references_node.add_child comm_node
147
152
 
148
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
149
154
 
150
- v.first.each do |val|
151
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
152
157
 
153
- target_node['id'] = val.to_s
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
154
160
  end
155
161
  end
156
162
  end
@@ -17,10 +17,12 @@ module Opener
17
17
  end
18
18
 
19
19
  def [] **params
20
+ existing = @cache[params]
21
+ return existing if existing and existing.from > UPDATE_INTERVAL.ago
22
+ params[:contract_ids] = nil unless params[:contract_ids]
23
+
20
24
  synchronize do
21
- existing = @cache[params]
22
- break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
- @cache[params] = cache_update existing, **params
25
+ @cache[params] = cache_update @cache[params], **params
24
26
  end
25
27
  end
26
28
  alias_method :get, :[]
@@ -49,7 +51,10 @@ module Opener
49
51
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
52
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
53
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
54
+ mapping[l.lemma.to_sym] << l
55
+ l.variants&.each do |v|
56
+ mapping[v.lemma.to_sym] << l
57
+ end
53
58
  end
54
59
 
55
60
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.5'
4
+ VERSION = '3.4.2'
5
5
 
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.5
4
+ version: 3.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-10 00:00:00.000000000 Z
11
+ date: 2021-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -212,8 +212,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
212
212
  - !ruby/object:Gem::Version
213
213
  version: '0'
214
214
  requirements: []
215
- rubyforge_project:
216
- rubygems_version: 2.7.6
215
+ rubygems_version: 3.2.14
217
216
  signing_key:
218
217
  specification_version: 4
219
218
  summary: Property tagger for hotels in Dutch and English.