opener-property-tagger 3.3.3 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 200701c1c3f256b0fbfa6fb284e321a16d10f399ced1fa750f7f9d4b66ed5a6f
4
- data.tar.gz: 6da856c2c5beb56d778f12e93db99c6e5109728c1c89a6a8f8cceec488a556d2
3
+ metadata.gz: cd341e6aded0a9e7690a95a6d56ad68fd530df0b55458a54e3831b0b3a743393
4
+ data.tar.gz: 1ecc43a166b60c8ee56dcd5345efbbb792acfac285a4c2b1d1448c0b5aa91c79
5
5
  SHA512:
6
- metadata.gz: 5d7056c547b3c7b845b4d3ad7d9f43019629c57e447b7ba5d10ada342809e708f4b62a6ac468daef61617a62e60b35391b541e9f393722d26590b531f6292559
7
- data.tar.gz: 7a79912c1317fb629c95244c8f0bb9cf73690bf3b194984c8a7c818a6cc3123f8eaa5957d6bbe32db01f000457472a9db91ba7f1c3b8b556fdfcd440ff033c1a
6
+ metadata.gz: efb8fb35c77c6886d01929d04e29d590e25ff0ecb70e98110c85c221d4e47557bca951793ea278a994f48660e1016c4c1df08e13c552f7b601b8fb65734f3293
7
+ data.tar.gz: 6f51528b4be7f5eb5985720914ac555066f7105832e8459db68eb729dfa4d45b8023751f491ce06776ca9af784f27951e6225246c1305324ce2c539452940110
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
38
+ File.foreach path do |line|
39
39
  lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -25,18 +26,19 @@ module Opener
25
26
  # by default due to the performance overhead.
26
27
  #
27
28
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
29
+ @document = Nokogiri.XML file
29
30
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
31
  @timestamp = timestamp
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,83 +46,79 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
59
59
 
60
- return pretty ? pretty_print(document) : document.to_xml
60
+ pretty ? pretty_print(document) : document.to_xml
61
61
  end
62
62
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
63
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
64
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
65
  end
71
66
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
67
  def terms
77
68
  unless @terms
78
69
  @terms = {}
79
70
 
80
71
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
72
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
73
  end
83
74
  end
84
75
 
85
- return @terms
76
+ @terms
86
77
  end
87
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
88
83
  ##
89
84
  # Check which terms belong to an aspect (property)
85
+ # Text have priority over Lemmas, overriding if there is a conflict
90
86
  # @return [Hash]
91
87
  #
92
88
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
95
-
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
100
-
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
-
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
107
-
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
111
-
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
114
-
115
- uniq_aspects[property.to_sym] << [ids,ngram]
89
+ all_term_ids = terms.keys
90
+ lemmas = terms.values
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
92
+
93
+ [:lemma, :text].each do |k|
94
+ current_token = 0
95
+
96
+ while current_token < terms.count
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
116
114
  end
117
115
  end
118
116
  end
117
+ current_token += 1
119
118
  end
120
- current_token += 1
121
119
  end
122
120
 
123
- return Hash[uniq_aspects.sort]
121
+ Hash[uniq_aspects.sort]
124
122
  end
125
123
 
126
124
  ##
@@ -140,25 +138,25 @@ module Opener
140
138
  new_node("properties", "KAF/features")
141
139
  end
142
140
 
143
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
144
142
  property_node = new_node("property", "KAF/features/properties")
145
143
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
144
+ property_node['lemma'] = lemma.to_s
145
+ property_node['pid'] = "p#{index.to_s}"
148
146
 
149
147
  references_node = new_node("references", property_node)
150
148
 
151
- value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
151
+ references_node.add_child comm_node
155
152
 
156
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
157
154
 
158
- v.first.each do |val|
159
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
160
157
 
161
- target_node.set('id', val.to_s)
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
162
160
  end
163
161
  end
164
162
  end
@@ -169,19 +167,19 @@ module Opener
169
167
  version = '2.0'
170
168
 
171
169
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
170
+ node['layer'] = 'features'
173
171
 
174
172
  lp_node = new_node('lp', node)
175
173
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
174
+ lp_node['version'] = "#{last_edited}-#{version}"
175
+ lp_node['name'] = description
178
176
 
179
177
  if timestamp
180
178
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
179
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
180
+ lp_node['timestamp'] = Time.now.strftime(format)
183
181
  else
184
- lp_node.set('timestamp', '*')
182
+ lp_node['timestamp'] = '*'
185
183
  end
186
184
  end
187
185
 
@@ -200,7 +198,7 @@ module Opener
200
198
  formatter.compact = true
201
199
  formatter.write(doc, out)
202
200
 
203
- return out.strip
201
+ out.strip
204
202
  end
205
203
 
206
204
  protected
@@ -212,11 +210,11 @@ module Opener
212
210
  parent_node = parent
213
211
  end
214
212
 
215
- node = Oga::XML::Element.new(:name => tag)
213
+ node = Nokogiri::XML::Element.new(tag, document)
216
214
 
217
- parent_node.children << node
215
+ parent_node.add_child node
218
216
 
219
- return node
217
+ node
220
218
  end
221
219
 
222
220
  ##
@@ -224,7 +222,7 @@ module Opener
224
222
  # @return [Boolean]
225
223
  #
226
224
  def is_kaf?
227
- return !!document.at_xpath('KAF')
225
+ !!document.at_xpath('KAF')
228
226
  end
229
227
 
230
228
  ##
@@ -17,10 +17,11 @@ module Opener
17
17
  end
18
18
 
19
19
  def [] **params
20
+ existing = @cache[params]
21
+ return existing if existing and existing.from > UPDATE_INTERVAL.ago
22
+
20
23
  synchronize do
21
- existing = @cache[params]
22
- break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
- @cache[params] = cache_update existing, **params
24
+ @cache[params] = cache_update @cache[params], **params
24
25
  end
25
26
  end
26
27
  alias_method :get, :[]
@@ -49,7 +50,10 @@ module Opener
49
50
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
51
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
52
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
53
+ mapping[l.lemma.to_sym] << l
54
+ l.variants&.each do |v|
55
+ mapping[v.lemma.to_sym] << l
56
+ end
53
57
  end
54
58
 
55
59
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.3'
4
+ VERSION = '3.4.1'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.3
4
+ version: 3.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-02 00:00:00.000000000 Z
11
+ date: 2021-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -53,25 +53,19 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
- name: oga
56
+ name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '1.0'
62
59
  - - ">="
63
60
  - !ruby/object:Gem::Version
64
- version: 1.3.1
61
+ version: '0'
65
62
  type: :runtime
66
63
  prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
76
70
  name: httpclient
77
71
  requirement: !ruby/object:Gem::Requirement
@@ -173,9 +167,9 @@ dependencies:
173
167
  description: Property tagger for hotels in Dutch and English.
174
168
  email:
175
169
  executables:
176
- - property-tagger-server
177
170
  - property-tagger
178
171
  - property-tagger-daemon
172
+ - property-tagger-server
179
173
  extensions: []
180
174
  extra_rdoc_files: []
181
175
  files: