opener-property-tagger 3.3.5 → 3.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4498897f273195326764dae62875d3901919f69014df57d0919d88b0755ee777
4
- data.tar.gz: e0c3c1cfcc32cf260024c2c4b5fee4afb7e6a2b4e46ba690499e8b151e959e52
3
+ metadata.gz: 181a83398a21579d625df2d6dfbb5e8d03759e2e285c40f3e6fe519f73f270b7
4
+ data.tar.gz: f2548258178f538a8f7e32e0617fd5f8bea8399373399529fab6316906739543
5
5
  SHA512:
6
- metadata.gz: 0f0557b5cb1604b6edec19c6d5c9b2d3cc892792d7562209d619cfe0f1f09c04fa0a340f3197e8f5963d0dbefbfb890d18e9051f82705eaa346f20682b724eb1
7
- data.tar.gz: 828fb63990b968d4511548b5ed6b05a44574cc9b0ed933745ddb7053cc642400ddf1ff7966f50a55b6dc84223a1fbbb8882c9cb28850dd3a8e98e550e9453183
6
+ metadata.gz: e3817f4e33c9aedc67d767e4e194a76a7b3d5f87331e7722916dd3f84c4475f51f04763a9dfc7561092eab623aeee2f90c2ab7b4b2a8caf2b99e34708ebe3995
7
+ data.tar.gz: bb42edeff3f5225972d475c4522da2b91b99484edb94d2a14b646ea7712964e0f22858819e7b0affb4b59ecfd87f4823035ab13d804a76c4b377684bfb7ea7c1
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
39
- lemma, _pos, aspect = line.chomp.split("\t")
38
+ File.foreach path do |line|
39
+ lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -31,12 +32,13 @@ module Opener
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,15 +46,13 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
@@ -76,36 +76,41 @@ module Opener
76
76
  @terms
77
77
  end
78
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
79
83
  ##
80
84
  # Check which terms belong to an aspect (property)
85
+ # Text have priority over Lemmas, overriding if there is a conflict
81
86
  # @return [Hash]
82
87
  #
83
88
  def extract_aspects
84
- term_ids = terms.keys
89
+ all_term_ids = terms.keys
85
90
  lemmas = terms.values
86
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
87
92
 
88
93
  [:lemma, :text].each do |k|
89
94
  current_token = 0
90
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
91
- # lemmas) belong to a property.
92
- max_ngram = 2
93
-
94
95
 
95
96
  while current_token < terms.count
96
- (0..max_ngram).each do |tam_ngram|
97
- if current_token + tam_ngram <= terms.count
98
- ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
99
-
100
- if aspects[ngram.to_sym]
101
- properties = aspects[ngram.to_sym]
102
- ids = term_ids[current_token..current_token+tam_ngram]
103
-
104
- properties.uniq.each do |property|
105
- next if !property or property.strip.empty?
106
-
107
- uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
108
- end
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
109
114
  end
110
115
  end
111
116
  end
@@ -133,24 +138,25 @@ module Opener
133
138
  new_node("properties", "KAF/features")
134
139
  end
135
140
 
136
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
137
142
  property_node = new_node("property", "KAF/features/properties")
138
143
 
139
- property_node['lemma'] = key.to_s
144
+ property_node['lemma'] = lemma.to_s
140
145
  property_node['pid'] = "p#{index.to_s}"
141
146
 
142
147
  references_node = new_node("references", property_node)
143
148
 
144
- value.uniq.each do |v|
145
- comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
146
151
  references_node.add_child comm_node
147
152
 
148
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
149
154
 
150
- v.first.each do |val|
151
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
152
157
 
153
- target_node['id'] = val.to_s
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
154
160
  end
155
161
  end
156
162
  end
@@ -17,10 +17,12 @@ module Opener
17
17
  end
18
18
 
19
19
  def [] **params
20
+ existing = @cache[params]
21
+ return existing if existing and existing.from > UPDATE_INTERVAL.ago
22
+ params[:contract_ids] = nil unless params[:contract_ids]
23
+
20
24
  synchronize do
21
- existing = @cache[params]
22
- break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
- @cache[params] = cache_update existing, **params
25
+ @cache[params] = cache_update @cache[params], **params
24
26
  end
25
27
  end
26
28
  alias_method :get, :[]
@@ -49,7 +51,10 @@ module Opener
49
51
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
52
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
53
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
54
+ mapping[l.lemma.to_sym] << l
55
+ l.variants&.each do |v|
56
+ mapping[v.lemma.to_sym] << l
57
+ end
53
58
  end
54
59
 
55
60
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.5'
4
+ VERSION = '3.4.2'
5
5
 
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.5
4
+ version: 3.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-10 00:00:00.000000000 Z
11
+ date: 2021-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -212,8 +212,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
212
212
  - !ruby/object:Gem::Version
213
213
  version: '0'
214
214
  requirements: []
215
- rubyforge_project:
216
- rubygems_version: 2.7.6
215
+ rubygems_version: 3.2.14
217
216
  signing_key:
218
217
  specification_version: 4
219
218
  summary: Property tagger for hotels in Dutch and English.