opener-property-tagger 3.3.6 → 3.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
4
- data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
3
+ metadata.gz: 2d223b859da75d45b68da9def103878187f20a8f95c780eaf73556d5b4d3eb5e
4
+ data.tar.gz: 021f008d00c3cbcf640f694703ca4308f77d080e0ab85ff72c89570a9f6987e7
5
5
  SHA512:
6
- metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
7
- data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
6
+ metadata.gz: fce119a3e41bc3647816c45fb328d3c626ebc7d8bbfe2d2372931d612da3aa7db1e8cd6591c02c6d512532736e5b820ae707fa7616114028165e37ceb073e495
7
+ data.tar.gz: 307786c8aa77e7785486d1e8eebc506dd7096861702855d2f2f0d2fe12b216be3764900de415e492f9d236536b41204e6cfe566cae8a92fa0b71b39363cf41f6
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
39
- lemma, _pos, aspect = line.chomp.split("\t")
38
+ File.foreach path do |line|
39
+ lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -34,10 +35,10 @@ module Opener
34
35
  @remote = !url.nil?
35
36
  @aspects_path = path
36
37
  @aspects_url = url
37
- @cache_keys = params[:cache_keys]
38
+ @cache_keys = params[:cache_keys] || {}
38
39
  @cache_keys.merge! lang: @document.root.attr('xml:lang')
39
40
 
40
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
42
  end
42
43
 
43
44
  ##
@@ -45,15 +46,13 @@ module Opener
45
46
  # @return [String]
46
47
  #
47
48
  def process
48
- existing_aspects = extract_aspects
49
-
50
49
  add_features_layer
51
50
  add_properties_layer
52
51
 
53
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
54
53
  index += 1
55
54
 
56
- add_property(key, value, index)
55
+ add_property lemma, values, index
57
56
  end
58
57
 
59
58
  add_linguistic_processor
@@ -77,37 +76,41 @@ module Opener
77
76
  @terms
78
77
  end
79
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
80
83
  ##
81
84
  # Check which terms belong to an aspect (property)
82
85
  # Text have priority over Lemmas, overriding if there is a conflict
83
86
  # @return [Hash]
84
87
  #
85
88
  def extract_aspects
86
- term_ids = terms.keys
89
+ all_term_ids = terms.keys
87
90
  lemmas = terms.values
88
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
89
92
 
90
93
  [:lemma, :text].each do |k|
91
94
  current_token = 0
92
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
93
- # lemmas) belong to a property.
94
- max_ngram = 2
95
-
96
95
 
97
96
  while current_token < terms.count
98
- (0..max_ngram).each do |tam_ngram|
99
- if current_token + tam_ngram <= terms.count
100
- ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
101
-
102
- if aspects[ngram.to_sym]
103
- properties = aspects[ngram.to_sym]
104
- ids = term_ids[current_token..current_token+tam_ngram]
105
-
106
- properties.uniq.each do |property|
107
- next if !property or property.strip.empty?
108
-
109
- uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
110
- end
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
111
114
  end
112
115
  end
113
116
  end
@@ -135,24 +138,25 @@ module Opener
135
138
  new_node("properties", "KAF/features")
136
139
  end
137
140
 
138
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
139
142
  property_node = new_node("property", "KAF/features/properties")
140
143
 
141
- property_node['lemma'] = key.to_s
144
+ property_node['lemma'] = lemma.to_s
142
145
  property_node['pid'] = "p#{index.to_s}"
143
146
 
144
147
  references_node = new_node("references", property_node)
145
148
 
146
- value.uniq.each do |v|
147
- comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
148
151
  references_node.add_child comm_node
149
152
 
150
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
151
154
 
152
- v.first.each do |val|
153
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
154
157
 
155
- target_node['id'] = val.to_s
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
156
160
  end
157
161
  end
158
162
  end
@@ -17,10 +17,11 @@ module Opener
17
17
  end
18
18
 
19
19
  def [] **params
20
+ existing = @cache[params]
21
+ return existing if existing and existing.from > UPDATE_INTERVAL.ago
22
+
20
23
  synchronize do
21
- existing = @cache[params]
22
- break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
- @cache[params] = cache_update existing, **params
24
+ @cache[params] = cache_update @cache[params], **params
24
25
  end
25
26
  end
26
27
  alias_method :get, :[]
@@ -49,7 +50,10 @@ module Opener
49
50
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
51
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
52
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
53
+ mapping[l.lemma.to_sym] << l
54
+ l.variants&.each do |v|
55
+ mapping[v.lemma.to_sym] << l
56
+ end
53
57
  end
54
58
 
55
59
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.6'
4
+ VERSION = '3.4.3'
5
5
 
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.6
4
+ version: 3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-12-10 00:00:00.000000000 Z
11
+ date: 2021-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -167,9 +167,9 @@ dependencies:
167
167
  description: Property tagger for hotels in Dutch and English.
168
168
  email:
169
169
  executables:
170
- - property-tagger
171
170
  - property-tagger-daemon
172
171
  - property-tagger-server
172
+ - property-tagger
173
173
  extensions: []
174
174
  extra_rdoc_files: []
175
175
  files:
@@ -212,8 +212,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
212
212
  - !ruby/object:Gem::Version
213
213
  version: '0'
214
214
  requirements: []
215
- rubyforge_project:
216
- rubygems_version: 2.7.8
215
+ rubygems_version: 3.2.14
217
216
  signing_key:
218
217
  specification_version: 4
219
218
  summary: Property tagger for hotels in Dutch and English.