opener-property-tagger 3.3.3 → 3.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 200701c1c3f256b0fbfa6fb284e321a16d10f399ced1fa750f7f9d4b66ed5a6f
4
- data.tar.gz: 6da856c2c5beb56d778f12e93db99c6e5109728c1c89a6a8f8cceec488a556d2
3
+ metadata.gz: cd341e6aded0a9e7690a95a6d56ad68fd530df0b55458a54e3831b0b3a743393
4
+ data.tar.gz: 1ecc43a166b60c8ee56dcd5345efbbb792acfac285a4c2b1d1448c0b5aa91c79
5
5
  SHA512:
6
- metadata.gz: 5d7056c547b3c7b845b4d3ad7d9f43019629c57e447b7ba5d10ada342809e708f4b62a6ac468daef61617a62e60b35391b541e9f393722d26590b531f6292559
7
- data.tar.gz: 7a79912c1317fb629c95244c8f0bb9cf73690bf3b194984c8a7c818a6cc3123f8eaa5957d6bbe32db01f000457472a9db91ba7f1c3b8b556fdfcd440ff033c1a
6
+ metadata.gz: efb8fb35c77c6886d01929d04e29d590e25ff0ecb70e98110c85c221d4e47557bca951793ea278a994f48660e1016c4c1df08e13c552f7b601b8fb65734f3293
7
+ data.tar.gz: 6f51528b4be7f5eb5985720914ac555066f7105832e8459db68eb729dfa4d45b8023751f491ce06776ca9af784f27951e6225246c1305324ce2c539452940110
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
38
+ File.foreach path do |line|
39
39
  lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -25,18 +26,19 @@ module Opener
25
26
  # by default due to the performance overhead.
26
27
  #
27
28
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
29
+ @document = Nokogiri.XML file
29
30
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
31
  @timestamp = timestamp
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,83 +46,79 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
59
59
 
60
- return pretty ? pretty_print(document) : document.to_xml
60
+ pretty ? pretty_print(document) : document.to_xml
61
61
  end
62
62
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
63
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
64
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
65
  end
71
66
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
67
  def terms
77
68
  unless @terms
78
69
  @terms = {}
79
70
 
80
71
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
72
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
73
  end
83
74
  end
84
75
 
85
- return @terms
76
+ @terms
86
77
  end
87
78
 
79
+ # Use n-grams to determine whether a unigram (1 lemma) or bigram (2
80
+ # lemmas) belongs to a property.
81
+ MAX_NGRAM = 2
82
+
88
83
  ##
89
84
  # Check which terms belong to an aspect (property)
85
+ # Text has priority over lemmas, overriding them if there is a conflict
90
86
  # @return [Hash]
91
87
  #
92
88
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
95
-
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
100
-
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
-
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
107
-
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
111
-
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
114
-
115
- uniq_aspects[property.to_sym] << [ids,ngram]
89
+ all_term_ids = terms.keys
90
+ lemmas = terms.values
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
92
+
93
+ [:lemma, :text].each do |k|
94
+ current_token = 0
95
+
96
+ while current_token < terms.count
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
116
114
  end
117
115
  end
118
116
  end
117
+ current_token += 1
119
118
  end
120
- current_token += 1
121
119
  end
122
120
 
123
- return Hash[uniq_aspects.sort]
121
+ Hash[uniq_aspects.sort]
124
122
  end
125
123
 
126
124
  ##
@@ -140,25 +138,25 @@ module Opener
140
138
  new_node("properties", "KAF/features")
141
139
  end
142
140
 
143
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
144
142
  property_node = new_node("property", "KAF/features/properties")
145
143
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
144
+ property_node['lemma'] = lemma.to_s
145
+ property_node['pid'] = "p#{index.to_s}"
148
146
 
149
147
  references_node = new_node("references", property_node)
150
148
 
151
- value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
151
+ references_node.add_child comm_node
155
152
 
156
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
157
154
 
158
- v.first.each do |val|
159
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
160
157
 
161
- target_node.set('id', val.to_s)
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
162
160
  end
163
161
  end
164
162
  end
@@ -169,19 +167,19 @@ module Opener
169
167
  version = '2.0'
170
168
 
171
169
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
170
+ node['layer'] = 'features'
173
171
 
174
172
  lp_node = new_node('lp', node)
175
173
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
174
+ lp_node['version'] = "#{last_edited}-#{version}"
175
+ lp_node['name'] = description
178
176
 
179
177
  if timestamp
180
178
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
179
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
180
+ lp_node['timestamp'] = Time.now.strftime(format)
183
181
  else
184
- lp_node.set('timestamp', '*')
182
+ lp_node['timestamp'] = '*'
185
183
  end
186
184
  end
187
185
 
@@ -200,7 +198,7 @@ module Opener
200
198
  formatter.compact = true
201
199
  formatter.write(doc, out)
202
200
 
203
- return out.strip
201
+ out.strip
204
202
  end
205
203
 
206
204
  protected
@@ -212,11 +210,11 @@ module Opener
212
210
  parent_node = parent
213
211
  end
214
212
 
215
- node = Oga::XML::Element.new(:name => tag)
213
+ node = Nokogiri::XML::Element.new(tag, document)
216
214
 
217
- parent_node.children << node
215
+ parent_node.add_child node
218
216
 
219
- return node
217
+ node
220
218
  end
221
219
 
222
220
  ##
@@ -224,7 +222,7 @@ module Opener
224
222
  # @return [Boolean]
225
223
  #
226
224
  def is_kaf?
227
- return !!document.at_xpath('KAF')
225
+ !!document.at_xpath('KAF')
228
226
  end
229
227
 
230
228
  ##
@@ -17,10 +17,11 @@ module Opener
17
17
  end
18
18
 
19
19
  def [] **params
20
+ existing = @cache[params]
21
+ return existing if existing and existing.from > UPDATE_INTERVAL.ago
22
+
20
23
  synchronize do
21
- existing = @cache[params]
22
- break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
- @cache[params] = cache_update existing, **params
24
+ @cache[params] = cache_update @cache[params], **params
24
25
  end
25
26
  end
26
27
  alias_method :get, :[]
@@ -49,7 +50,10 @@ module Opener
49
50
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
51
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
52
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
53
+ mapping[l.lemma.to_sym] << l
54
+ l.variants&.each do |v|
55
+ mapping[v.lemma.to_sym] << l
56
+ end
53
57
  end
54
58
 
55
59
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.3'
4
+ VERSION = '3.4.1'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.3
4
+ version: 3.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-02 00:00:00.000000000 Z
11
+ date: 2021-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -53,25 +53,19 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
- name: oga
56
+ name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '1.0'
62
59
  - - ">="
63
60
  - !ruby/object:Gem::Version
64
- version: 1.3.1
61
+ version: '0'
65
62
  type: :runtime
66
63
  prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
76
70
  name: httpclient
77
71
  requirement: !ruby/object:Gem::Requirement
@@ -173,9 +167,9 @@ dependencies:
173
167
  description: Property tagger for hotels in Dutch and English.
174
168
  email:
175
169
  executables:
176
- - property-tagger-server
177
170
  - property-tagger
178
171
  - property-tagger-daemon
172
+ - property-tagger-server
179
173
  extensions: []
180
174
  extra_rdoc_files: []
181
175
  files: