opener-property-tagger 3.3.1 → 3.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d732e35937583e6fbad1452c2dc72f894df668979a9259c19f4a84a9557b8971
4
- data.tar.gz: 6999b3814921c0ec9cefd59b096c89b18c4ac4e9c5a9529f0fa3141c463ea306
3
+ metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
4
+ data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
5
5
  SHA512:
6
- metadata.gz: 0d1d7714bfeab24e5505ea3d442b423ab3fe6daa26dfe214b412add729e2544543d0ccadf078bec3d852bb3fa2b1ec541b131a064b9a2598e38bebbeb58a6061
7
- data.tar.gz: 24a4b3b4dcd85edd9249b16e79673487a174c58e7c826820c4b1a67aed56a8f92ec4e228e0d773598b3ed6ce14b5196a2163b48c63646d3442014fcfc2af561c
6
+ metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
7
+ data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -36,7 +36,7 @@ module Opener
36
36
  mapping = Hash.new { |hash, key| hash[key] = [] }
37
37
 
38
38
  File.foreach(path) do |line|
39
- lemma, pos, aspect = line.chomp.split("\t")
39
+ lemma, _pos, aspect = line.chomp.split("\t")
40
40
 
41
41
  mapping[lemma.to_sym] << aspect
42
42
  end
@@ -25,16 +25,17 @@ module Opener
25
25
  # by default due to the performance overhead.
26
26
  #
27
27
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
28
+ @document = Nokogiri.XML file
29
29
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
30
  @timestamp = timestamp
31
31
  @pretty = pretty
32
32
 
33
33
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
34
  @remote = !url.nil?
36
35
  @aspects_path = path
37
36
  @aspects_url = url
37
+ @cache_keys = params[:cache_keys]
38
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
39
 
39
40
  @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
41
  end
@@ -57,70 +58,64 @@ module Opener
57
58
 
58
59
  add_linguistic_processor
59
60
 
60
- return pretty ? pretty_print(document) : document.to_xml
61
+ pretty ? pretty_print(document) : document.to_xml
61
62
  end
62
63
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
64
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
65
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
66
  end
71
67
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
68
  def terms
77
69
  unless @terms
78
70
  @terms = {}
79
71
 
80
72
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
73
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
74
  end
83
75
  end
84
76
 
85
- return @terms
77
+ @terms
86
78
  end
87
79
 
88
80
  ##
89
81
  # Check which terms belong to an aspect (property)
82
+ # Text have priority over Lemmas, overriding if there is a conflict
90
83
  # @return [Hash]
91
84
  #
92
85
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
86
+ term_ids = terms.keys
87
+ lemmas = terms.values
88
+ uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
95
89
 
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
90
+ [:lemma, :text].each do |k|
91
+ current_token = 0
92
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
93
+ # lemmas) belong to a property.
94
+ max_ngram = 2
100
95
 
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
96
 
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
97
+ while current_token < terms.count
98
+ (0..max_ngram).each do |tam_ngram|
99
+ if current_token + tam_ngram <= terms.count
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
107
101
 
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
102
+ if aspects[ngram.to_sym]
103
+ properties = aspects[ngram.to_sym]
104
+ ids = term_ids[current_token..current_token+tam_ngram]
111
105
 
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
106
+ properties.uniq.each do |property|
107
+ next if !property or property.strip.empty?
114
108
 
115
- uniq_aspects[property.to_sym] << [ids,ngram]
109
+ uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
110
+ end
116
111
  end
117
112
  end
118
113
  end
114
+ current_token += 1
119
115
  end
120
- current_token += 1
121
116
  end
122
117
 
123
- return Hash[uniq_aspects.sort]
118
+ Hash[uniq_aspects.sort]
124
119
  end
125
120
 
126
121
  ##
@@ -143,22 +138,21 @@ module Opener
143
138
  def add_property(key, value, index)
144
139
  property_node = new_node("property", "KAF/features/properties")
145
140
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
141
+ property_node['lemma'] = key.to_s
142
+ property_node['pid'] = "p#{index.to_s}"
148
143
 
149
144
  references_node = new_node("references", property_node)
150
145
 
151
146
  value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
147
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
148
+ references_node.add_child comm_node
155
149
 
156
150
  span_node = new_node("span", references_node)
157
151
 
158
152
  v.first.each do |val|
159
- target_node = new_node("target", span_node)
153
+ target_node = new_node("target", span_node)
160
154
 
161
- target_node.set('id', val.to_s)
155
+ target_node['id'] = val.to_s
162
156
  end
163
157
  end
164
158
  end
@@ -169,19 +163,19 @@ module Opener
169
163
  version = '2.0'
170
164
 
171
165
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
166
+ node['layer'] = 'features'
173
167
 
174
168
  lp_node = new_node('lp', node)
175
169
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
170
+ lp_node['version'] = "#{last_edited}-#{version}"
171
+ lp_node['name'] = description
178
172
 
179
173
  if timestamp
180
174
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
175
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
176
+ lp_node['timestamp'] = Time.now.strftime(format)
183
177
  else
184
- lp_node.set('timestamp', '*')
178
+ lp_node['timestamp'] = '*'
185
179
  end
186
180
  end
187
181
 
@@ -200,7 +194,7 @@ module Opener
200
194
  formatter.compact = true
201
195
  formatter.write(doc, out)
202
196
 
203
- return out.strip
197
+ out.strip
204
198
  end
205
199
 
206
200
  protected
@@ -212,11 +206,11 @@ module Opener
212
206
  parent_node = parent
213
207
  end
214
208
 
215
- node = Oga::XML::Element.new(:name => tag)
209
+ node = Nokogiri::XML::Element.new(tag, document)
216
210
 
217
- parent_node.children << node
211
+ parent_node.add_child node
218
212
 
219
- return node
213
+ node
220
214
  end
221
215
 
222
216
  ##
@@ -224,7 +218,7 @@ module Opener
224
218
  # @return [Boolean]
225
219
  #
226
220
  def is_kaf?
227
- return !!document.at_xpath('KAF')
221
+ !!document.at_xpath('KAF')
228
222
  end
229
223
 
230
224
  ##
@@ -7,6 +7,8 @@ module Opener
7
7
 
8
8
  include MonitorMixin
9
9
 
10
+ UPDATE_INTERVAL = (ENV['CACHE_EXPIRE_MINS']&.to_i || 5).minutes
11
+
10
12
  def initialize
11
13
  super
12
14
 
@@ -16,13 +18,9 @@ module Opener
16
18
 
17
19
  def [] **params
18
20
  synchronize do
19
- if existing = @cache[params]
20
- existing.tap do
21
- Thread.new{ @cache[params] = cache_update existing, **params }
22
- end
23
- else
24
- @cache[params] = cache_update **params
25
- end
21
+ existing = @cache[params]
22
+ break existing if existing and existing.from > UPDATE_INTERVAL.ago
23
+ @cache[params] = cache_update existing, **params
26
24
  end
27
25
  end
28
26
  alias_method :get, :[]
@@ -31,7 +29,11 @@ module Opener
31
29
  from = Time.now
32
30
  lexicons = load_aspects cache: existing, **params
33
31
 
34
- return existing if existing and lexicons.blank?
32
+ if existing and lexicons.blank?
33
+ existing.from = from
34
+ return existing
35
+ end
36
+
35
37
  Hashie::Mash.new(
36
38
  aspects: lexicons,
37
39
  from: from,
@@ -39,8 +41,8 @@ module Opener
39
41
  end
40
42
 
41
43
  def load_aspects lang:, cache:, **params
42
- url = "#{@url}&language_code=#{lang}&#{params.to_query}"
43
- url += "&if_updated_since=#{cache.from.iso8601}" if cache
44
+ url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
+ url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
44
46
  puts "#{lang}: loading aspects from #{url}"
45
47
 
46
48
  lexicons = JSON.parse HTTPClient.new.get(url).body
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.1'
4
+ VERSION = '3.3.6'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,170 +1,164 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.1
4
+ version: 3.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-07 00:00:00.000000000 Z
11
+ date: 2020-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: opener-daemons
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '2.2'
19
- name: opener-daemons
20
- prerelease: false
21
20
  type: :runtime
21
+ prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: opener-webservice
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.1'
33
- name: opener-webservice
34
- prerelease: false
35
34
  type: :runtime
35
+ prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.1'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: opener-core
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - "~>"
45
46
  - !ruby/object:Gem::Version
46
47
  version: '2.2'
47
- name: opener-core
48
- prerelease: false
49
48
  type: :runtime
49
+ prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: nokogiri
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
- - - "~>"
59
- - !ruby/object:Gem::Version
60
- version: '1.0'
61
59
  - - ">="
62
60
  - !ruby/object:Gem::Version
63
- version: 1.3.1
64
- name: oga
65
- prerelease: false
61
+ version: '0'
66
62
  type: :runtime
63
+ prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
70
+ name: httpclient
76
71
  requirement: !ruby/object:Gem::Requirement
77
72
  requirements:
78
73
  - - ">="
79
74
  - !ruby/object:Gem::Version
80
75
  version: '0'
81
- name: httpclient
82
- prerelease: false
83
76
  type: :runtime
77
+ prerelease: false
84
78
  version_requirements: !ruby/object:Gem::Requirement
85
79
  requirements:
86
80
  - - ">="
87
81
  - !ruby/object:Gem::Version
88
82
  version: '0'
89
83
  - !ruby/object:Gem::Dependency
84
+ name: hashie
90
85
  requirement: !ruby/object:Gem::Requirement
91
86
  requirements:
92
87
  - - ">="
93
88
  - !ruby/object:Gem::Version
94
89
  version: '0'
95
- name: hashie
96
- prerelease: false
97
90
  type: :runtime
91
+ prerelease: false
98
92
  version_requirements: !ruby/object:Gem::Requirement
99
93
  requirements:
100
94
  - - ">="
101
95
  - !ruby/object:Gem::Version
102
96
  version: '0'
103
97
  - !ruby/object:Gem::Dependency
98
+ name: activesupport
104
99
  requirement: !ruby/object:Gem::Requirement
105
100
  requirements:
106
101
  - - ">="
107
102
  - !ruby/object:Gem::Version
108
103
  version: '0'
109
- name: activesupport
110
- prerelease: false
111
104
  type: :runtime
105
+ prerelease: false
112
106
  version_requirements: !ruby/object:Gem::Requirement
113
107
  requirements:
114
108
  - - ">="
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  - !ruby/object:Gem::Dependency
112
+ name: rspec
118
113
  requirement: !ruby/object:Gem::Requirement
119
114
  requirements:
120
115
  - - "~>"
121
116
  - !ruby/object:Gem::Version
122
117
  version: '3.0'
123
- name: rspec
124
- prerelease: false
125
118
  type: :development
119
+ prerelease: false
126
120
  version_requirements: !ruby/object:Gem::Requirement
127
121
  requirements:
128
122
  - - "~>"
129
123
  - !ruby/object:Gem::Version
130
124
  version: '3.0'
131
125
  - !ruby/object:Gem::Dependency
126
+ name: cucumber
132
127
  requirement: !ruby/object:Gem::Requirement
133
128
  requirements:
134
129
  - - ">="
135
130
  - !ruby/object:Gem::Version
136
131
  version: '0'
137
- name: cucumber
138
- prerelease: false
139
132
  type: :development
133
+ prerelease: false
140
134
  version_requirements: !ruby/object:Gem::Requirement
141
135
  requirements:
142
136
  - - ">="
143
137
  - !ruby/object:Gem::Version
144
138
  version: '0'
145
139
  - !ruby/object:Gem::Dependency
140
+ name: rake
146
141
  requirement: !ruby/object:Gem::Requirement
147
142
  requirements:
148
143
  - - ">="
149
144
  - !ruby/object:Gem::Version
150
145
  version: '0'
151
- name: rake
152
- prerelease: false
153
146
  type: :development
147
+ prerelease: false
154
148
  version_requirements: !ruby/object:Gem::Requirement
155
149
  requirements:
156
150
  - - ">="
157
151
  - !ruby/object:Gem::Version
158
152
  version: '0'
159
153
  - !ruby/object:Gem::Dependency
154
+ name: benchmark-ips
160
155
  requirement: !ruby/object:Gem::Requirement
161
156
  requirements:
162
157
  - - "~>"
163
158
  - !ruby/object:Gem::Version
164
159
  version: '2.0'
165
- name: benchmark-ips
166
- prerelease: false
167
160
  type: :development
161
+ prerelease: false
168
162
  version_requirements: !ruby/object:Gem::Requirement
169
163
  requirements:
170
164
  - - "~>"
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
219
213
  version: '0'
220
214
  requirements: []
221
215
  rubyforge_project:
222
- rubygems_version: 2.7.9
216
+ rubygems_version: 2.7.8
223
217
  signing_key:
224
218
  specification_version: 4
225
219
  summary: Property tagger for hotels in Dutch and English.