opener-property-tagger 3.3.1 → 3.3.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d732e35937583e6fbad1452c2dc72f894df668979a9259c19f4a84a9557b8971
4
- data.tar.gz: 6999b3814921c0ec9cefd59b096c89b18c4ac4e9c5a9529f0fa3141c463ea306
3
+ metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
4
+ data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
5
5
  SHA512:
6
- metadata.gz: 0d1d7714bfeab24e5505ea3d442b423ab3fe6daa26dfe214b412add729e2544543d0ccadf078bec3d852bb3fa2b1ec541b131a064b9a2598e38bebbeb58a6061
7
- data.tar.gz: 24a4b3b4dcd85edd9249b16e79673487a174c58e7c826820c4b1a67aed56a8f92ec4e228e0d773598b3ed6ce14b5196a2163b48c63646d3442014fcfc2af561c
6
+ metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
7
+ data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -36,7 +36,7 @@ module Opener
36
36
  mapping = Hash.new { |hash, key| hash[key] = [] }
37
37
 
38
38
  File.foreach(path) do |line|
39
- lemma, pos, aspect = line.chomp.split("\t")
39
+ lemma, _pos, aspect = line.chomp.split("\t")
40
40
 
41
41
  mapping[lemma.to_sym] << aspect
42
42
  end
@@ -25,16 +25,17 @@ module Opener
25
25
  # by default due to the performance overhead.
26
26
  #
27
27
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
28
+ @document = Nokogiri.XML file
29
29
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
30
  @timestamp = timestamp
31
31
  @pretty = pretty
32
32
 
33
33
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
34
  @remote = !url.nil?
36
35
  @aspects_path = path
37
36
  @aspects_url = url
37
+ @cache_keys = params[:cache_keys]
38
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
39
 
39
40
  @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
41
  end
@@ -57,70 +58,64 @@ module Opener
57
58
 
58
59
  add_linguistic_processor
59
60
 
60
- return pretty ? pretty_print(document) : document.to_xml
61
+ pretty ? pretty_print(document) : document.to_xml
61
62
  end
62
63
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
64
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
65
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
66
  end
71
67
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
68
  def terms
77
69
  unless @terms
78
70
  @terms = {}
79
71
 
80
72
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
73
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
74
  end
83
75
  end
84
76
 
85
- return @terms
77
+ @terms
86
78
  end
87
79
 
88
80
  ##
89
81
  # Check which terms belong to an aspect (property)
82
+ # Text have priority over Lemmas, overriding if there is a conflict
90
83
  # @return [Hash]
91
84
  #
92
85
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
86
+ term_ids = terms.keys
87
+ lemmas = terms.values
88
+ uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
95
89
 
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
90
+ [:lemma, :text].each do |k|
91
+ current_token = 0
92
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
93
+ # lemmas) belong to a property.
94
+ max_ngram = 2
100
95
 
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
96
 
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
97
+ while current_token < terms.count
98
+ (0..max_ngram).each do |tam_ngram|
99
+ if current_token + tam_ngram <= terms.count
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
107
101
 
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
102
+ if aspects[ngram.to_sym]
103
+ properties = aspects[ngram.to_sym]
104
+ ids = term_ids[current_token..current_token+tam_ngram]
111
105
 
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
106
+ properties.uniq.each do |property|
107
+ next if !property or property.strip.empty?
114
108
 
115
- uniq_aspects[property.to_sym] << [ids,ngram]
109
+ uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
110
+ end
116
111
  end
117
112
  end
118
113
  end
114
+ current_token += 1
119
115
  end
120
- current_token += 1
121
116
  end
122
117
 
123
- return Hash[uniq_aspects.sort]
118
+ Hash[uniq_aspects.sort]
124
119
  end
125
120
 
126
121
  ##
@@ -143,22 +138,21 @@ module Opener
143
138
  def add_property(key, value, index)
144
139
  property_node = new_node("property", "KAF/features/properties")
145
140
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
141
+ property_node['lemma'] = key.to_s
142
+ property_node['pid'] = "p#{index.to_s}"
148
143
 
149
144
  references_node = new_node("references", property_node)
150
145
 
151
146
  value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
147
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
148
+ references_node.add_child comm_node
155
149
 
156
150
  span_node = new_node("span", references_node)
157
151
 
158
152
  v.first.each do |val|
159
- target_node = new_node("target", span_node)
153
+ target_node = new_node("target", span_node)
160
154
 
161
- target_node.set('id', val.to_s)
155
+ target_node['id'] = val.to_s
162
156
  end
163
157
  end
164
158
  end
@@ -169,19 +163,19 @@ module Opener
169
163
  version = '2.0'
170
164
 
171
165
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
166
+ node['layer'] = 'features'
173
167
 
174
168
  lp_node = new_node('lp', node)
175
169
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
170
+ lp_node['version'] = "#{last_edited}-#{version}"
171
+ lp_node['name'] = description
178
172
 
179
173
  if timestamp
180
174
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
175
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
176
+ lp_node['timestamp'] = Time.now.strftime(format)
183
177
  else
184
- lp_node.set('timestamp', '*')
178
+ lp_node['timestamp'] = '*'
185
179
  end
186
180
  end
187
181
 
@@ -200,7 +194,7 @@ module Opener
200
194
  formatter.compact = true
201
195
  formatter.write(doc, out)
202
196
 
203
- return out.strip
197
+ out.strip
204
198
  end
205
199
 
206
200
  protected
@@ -212,11 +206,11 @@ module Opener
212
206
  parent_node = parent
213
207
  end
214
208
 
215
- node = Oga::XML::Element.new(:name => tag)
209
+ node = Nokogiri::XML::Element.new(tag, document)
216
210
 
217
- parent_node.children << node
211
+ parent_node.add_child node
218
212
 
219
- return node
213
+ node
220
214
  end
221
215
 
222
216
  ##
@@ -224,7 +218,7 @@ module Opener
224
218
  # @return [Boolean]
225
219
  #
226
220
  def is_kaf?
227
- return !!document.at_xpath('KAF')
221
+ !!document.at_xpath('KAF')
228
222
  end
229
223
 
230
224
  ##
@@ -7,6 +7,8 @@ module Opener
7
7
 
8
8
  include MonitorMixin
9
9
 
10
+ UPDATE_INTERVAL = (ENV['CACHE_EXPIRE_MINS']&.to_i || 5).minutes
11
+
10
12
  def initialize
11
13
  super
12
14
 
@@ -16,13 +18,9 @@ module Opener
16
18
 
17
19
  def [] **params
18
20
  synchronize do
19
- if existing = @cache[params]
20
- existing.tap do
21
- Thread.new{ @cache[params] = cache_update existing, **params }
22
- end
23
- else
24
- @cache[params] = cache_update **params
25
- end
21
+ existing = @cache[params]
22
+ break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
+ @cache[params] = cache_update existing, **params
26
24
  end
27
25
  end
28
26
  alias_method :get, :[]
@@ -31,7 +29,11 @@ module Opener
31
29
  from = Time.now
32
30
  lexicons = load_aspects cache: existing, **params
33
31
 
34
- return existing if existing and lexicons.blank?
32
+ if existing and lexicons.blank?
33
+ existing.from = from
34
+ return existing
35
+ end
36
+
35
37
  Hashie::Mash.new(
36
38
  aspects: lexicons,
37
39
  from: from,
@@ -39,8 +41,8 @@ module Opener
39
41
  end
40
42
 
41
43
  def load_aspects lang:, cache:, **params
42
- url = "#{@url}&language_code=#{lang}&#{params.to_query}"
43
- url += "&if_updated_since=#{cache.from.iso8601}" if cache
44
+ url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
+ url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
44
46
  puts "#{lang}: loading aspects from #{url}"
45
47
 
46
48
  lexicons = JSON.parse HTTPClient.new.get(url).body
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.1'
4
+ VERSION = '3.3.6'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,170 +1,164 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.1
4
+ version: 3.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-07 00:00:00.000000000 Z
11
+ date: 2020-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: opener-daemons
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '2.2'
19
- name: opener-daemons
20
- prerelease: false
21
20
  type: :runtime
21
+ prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: opener-webservice
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.1'
33
- name: opener-webservice
34
- prerelease: false
35
34
  type: :runtime
35
+ prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.1'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: opener-core
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - "~>"
45
46
  - !ruby/object:Gem::Version
46
47
  version: '2.2'
47
- name: opener-core
48
- prerelease: false
49
48
  type: :runtime
49
+ prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: nokogiri
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
- - - "~>"
59
- - !ruby/object:Gem::Version
60
- version: '1.0'
61
59
  - - ">="
62
60
  - !ruby/object:Gem::Version
63
- version: 1.3.1
64
- name: oga
65
- prerelease: false
61
+ version: '0'
66
62
  type: :runtime
63
+ prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
70
+ name: httpclient
76
71
  requirement: !ruby/object:Gem::Requirement
77
72
  requirements:
78
73
  - - ">="
79
74
  - !ruby/object:Gem::Version
80
75
  version: '0'
81
- name: httpclient
82
- prerelease: false
83
76
  type: :runtime
77
+ prerelease: false
84
78
  version_requirements: !ruby/object:Gem::Requirement
85
79
  requirements:
86
80
  - - ">="
87
81
  - !ruby/object:Gem::Version
88
82
  version: '0'
89
83
  - !ruby/object:Gem::Dependency
84
+ name: hashie
90
85
  requirement: !ruby/object:Gem::Requirement
91
86
  requirements:
92
87
  - - ">="
93
88
  - !ruby/object:Gem::Version
94
89
  version: '0'
95
- name: hashie
96
- prerelease: false
97
90
  type: :runtime
91
+ prerelease: false
98
92
  version_requirements: !ruby/object:Gem::Requirement
99
93
  requirements:
100
94
  - - ">="
101
95
  - !ruby/object:Gem::Version
102
96
  version: '0'
103
97
  - !ruby/object:Gem::Dependency
98
+ name: activesupport
104
99
  requirement: !ruby/object:Gem::Requirement
105
100
  requirements:
106
101
  - - ">="
107
102
  - !ruby/object:Gem::Version
108
103
  version: '0'
109
- name: activesupport
110
- prerelease: false
111
104
  type: :runtime
105
+ prerelease: false
112
106
  version_requirements: !ruby/object:Gem::Requirement
113
107
  requirements:
114
108
  - - ">="
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  - !ruby/object:Gem::Dependency
112
+ name: rspec
118
113
  requirement: !ruby/object:Gem::Requirement
119
114
  requirements:
120
115
  - - "~>"
121
116
  - !ruby/object:Gem::Version
122
117
  version: '3.0'
123
- name: rspec
124
- prerelease: false
125
118
  type: :development
119
+ prerelease: false
126
120
  version_requirements: !ruby/object:Gem::Requirement
127
121
  requirements:
128
122
  - - "~>"
129
123
  - !ruby/object:Gem::Version
130
124
  version: '3.0'
131
125
  - !ruby/object:Gem::Dependency
126
+ name: cucumber
132
127
  requirement: !ruby/object:Gem::Requirement
133
128
  requirements:
134
129
  - - ">="
135
130
  - !ruby/object:Gem::Version
136
131
  version: '0'
137
- name: cucumber
138
- prerelease: false
139
132
  type: :development
133
+ prerelease: false
140
134
  version_requirements: !ruby/object:Gem::Requirement
141
135
  requirements:
142
136
  - - ">="
143
137
  - !ruby/object:Gem::Version
144
138
  version: '0'
145
139
  - !ruby/object:Gem::Dependency
140
+ name: rake
146
141
  requirement: !ruby/object:Gem::Requirement
147
142
  requirements:
148
143
  - - ">="
149
144
  - !ruby/object:Gem::Version
150
145
  version: '0'
151
- name: rake
152
- prerelease: false
153
146
  type: :development
147
+ prerelease: false
154
148
  version_requirements: !ruby/object:Gem::Requirement
155
149
  requirements:
156
150
  - - ">="
157
151
  - !ruby/object:Gem::Version
158
152
  version: '0'
159
153
  - !ruby/object:Gem::Dependency
154
+ name: benchmark-ips
160
155
  requirement: !ruby/object:Gem::Requirement
161
156
  requirements:
162
157
  - - "~>"
163
158
  - !ruby/object:Gem::Version
164
159
  version: '2.0'
165
- name: benchmark-ips
166
- prerelease: false
167
160
  type: :development
161
+ prerelease: false
168
162
  version_requirements: !ruby/object:Gem::Requirement
169
163
  requirements:
170
164
  - - "~>"
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
219
213
  version: '0'
220
214
  requirements: []
221
215
  rubyforge_project:
222
- rubygems_version: 2.7.9
216
+ rubygems_version: 2.7.8
223
217
  signing_key:
224
218
  specification_version: 4
225
219
  summary: Property tagger for hotels in Dutch and English.