opener-property-tagger 3.3.1 → 3.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +1 -1
- data/lib/opener/property_tagger/file_aspects_cache.rb +1 -1
- data/lib/opener/property_tagger/processor.rb +45 -51
- data/lib/opener/property_tagger/remote_aspects_cache.rb +12 -10
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -1
- metadata +27 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
|
4
|
+
data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
|
7
|
+
data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
|
@@ -25,16 +25,17 @@ module Opener
|
|
25
25
|
# by default due to the performance overhead.
|
26
26
|
#
|
27
27
|
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
|
28
|
-
@document =
|
28
|
+
@document = Nokogiri.XML file
|
29
29
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
30
30
|
@timestamp = timestamp
|
31
31
|
@pretty = pretty
|
32
32
|
|
33
33
|
@params = params
|
34
|
-
@cache_keys = params[:cache_keys] || {lang: language}
|
35
34
|
@remote = !url.nil?
|
36
35
|
@aspects_path = path
|
37
36
|
@aspects_url = url
|
37
|
+
@cache_keys = params[:cache_keys]
|
38
|
+
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
38
39
|
|
39
40
|
@aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
40
41
|
end
|
@@ -57,70 +58,64 @@ module Opener
|
|
57
58
|
|
58
59
|
add_linguistic_processor
|
59
60
|
|
60
|
-
|
61
|
+
pretty ? pretty_print(document) : document.to_xml
|
61
62
|
end
|
62
63
|
|
63
|
-
##
|
64
|
-
# Get the language of the input file.
|
65
|
-
#
|
66
|
-
# @return [String]
|
67
|
-
#
|
68
64
|
def language
|
69
|
-
|
65
|
+
@language ||= document.at_xpath('KAF').attr('xml:lang')
|
70
66
|
end
|
71
67
|
|
72
|
-
##
|
73
|
-
# Get the terms from the input file
|
74
|
-
# @return [Hash]
|
75
|
-
#
|
76
68
|
def terms
|
77
69
|
unless @terms
|
78
70
|
@terms = {}
|
79
71
|
|
80
72
|
document.xpath('KAF/terms/term').each do |term|
|
81
|
-
@terms[term.
|
73
|
+
@terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
|
82
74
|
end
|
83
75
|
end
|
84
76
|
|
85
|
-
|
77
|
+
@terms
|
86
78
|
end
|
87
79
|
|
88
80
|
##
|
89
81
|
# Check which terms belong to an aspect (property)
|
82
|
+
# Text have priority over Lemmas, overriding if there is a conflict
|
90
83
|
# @return [Hash]
|
91
84
|
#
|
92
85
|
def extract_aspects
|
93
|
-
term_ids
|
94
|
-
lemmas
|
86
|
+
term_ids = terms.keys
|
87
|
+
lemmas = terms.values
|
88
|
+
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
95
89
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
90
|
+
[:lemma, :text].each do |k|
|
91
|
+
current_token = 0
|
92
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
93
|
+
# lemmas) belong to a property.
|
94
|
+
max_ngram = 2
|
100
95
|
|
101
|
-
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
102
96
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
97
|
+
while current_token < terms.count
|
98
|
+
(0..max_ngram).each do |tam_ngram|
|
99
|
+
if current_token + tam_ngram <= terms.count
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
|
107
101
|
|
108
|
-
|
109
|
-
|
110
|
-
|
102
|
+
if aspects[ngram.to_sym]
|
103
|
+
properties = aspects[ngram.to_sym]
|
104
|
+
ids = term_ids[current_token..current_token+tam_ngram]
|
111
105
|
|
112
|
-
|
113
|
-
|
106
|
+
properties.uniq.each do |property|
|
107
|
+
next if !property or property.strip.empty?
|
114
108
|
|
115
|
-
|
109
|
+
uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
|
110
|
+
end
|
116
111
|
end
|
117
112
|
end
|
118
113
|
end
|
114
|
+
current_token += 1
|
119
115
|
end
|
120
|
-
current_token += 1
|
121
116
|
end
|
122
117
|
|
123
|
-
|
118
|
+
Hash[uniq_aspects.sort]
|
124
119
|
end
|
125
120
|
|
126
121
|
##
|
@@ -143,22 +138,21 @@ module Opener
|
|
143
138
|
def add_property(key, value, index)
|
144
139
|
property_node = new_node("property", "KAF/features/properties")
|
145
140
|
|
146
|
-
property_node
|
147
|
-
property_node
|
141
|
+
property_node['lemma'] = key.to_s
|
142
|
+
property_node['pid'] = "p#{index.to_s}"
|
148
143
|
|
149
144
|
references_node = new_node("references", property_node)
|
150
145
|
|
151
146
|
value.uniq.each do |v|
|
152
|
-
|
153
|
-
|
154
|
-
references_node.children << comment
|
147
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
|
148
|
+
references_node.add_child comm_node
|
155
149
|
|
156
150
|
span_node = new_node("span", references_node)
|
157
151
|
|
158
152
|
v.first.each do |val|
|
159
|
-
target_node
|
153
|
+
target_node = new_node("target", span_node)
|
160
154
|
|
161
|
-
target_node
|
155
|
+
target_node['id'] = val.to_s
|
162
156
|
end
|
163
157
|
end
|
164
158
|
end
|
@@ -169,19 +163,19 @@ module Opener
|
|
169
163
|
version = '2.0'
|
170
164
|
|
171
165
|
node = new_node('linguisticProcessors', 'KAF/kafHeader')
|
172
|
-
node
|
166
|
+
node['layer'] = 'features'
|
173
167
|
|
174
168
|
lp_node = new_node('lp', node)
|
175
169
|
|
176
|
-
lp_node
|
177
|
-
lp_node
|
170
|
+
lp_node['version'] = "#{last_edited}-#{version}"
|
171
|
+
lp_node['name'] = description
|
178
172
|
|
179
173
|
if timestamp
|
180
174
|
format = '%Y-%m-%dT%H:%M:%S%Z'
|
181
175
|
|
182
|
-
lp_node
|
176
|
+
lp_node['timestamp'] = Time.now.strftime(format)
|
183
177
|
else
|
184
|
-
lp_node
|
178
|
+
lp_node['timestamp'] = '*'
|
185
179
|
end
|
186
180
|
end
|
187
181
|
|
@@ -200,7 +194,7 @@ module Opener
|
|
200
194
|
formatter.compact = true
|
201
195
|
formatter.write(doc, out)
|
202
196
|
|
203
|
-
|
197
|
+
out.strip
|
204
198
|
end
|
205
199
|
|
206
200
|
protected
|
@@ -212,11 +206,11 @@ module Opener
|
|
212
206
|
parent_node = parent
|
213
207
|
end
|
214
208
|
|
215
|
-
node =
|
209
|
+
node = Nokogiri::XML::Element.new(tag, document)
|
216
210
|
|
217
|
-
parent_node.
|
211
|
+
parent_node.add_child node
|
218
212
|
|
219
|
-
|
213
|
+
node
|
220
214
|
end
|
221
215
|
|
222
216
|
##
|
@@ -224,7 +218,7 @@ module Opener
|
|
224
218
|
# @return [Boolean]
|
225
219
|
#
|
226
220
|
def is_kaf?
|
227
|
-
|
221
|
+
!!document.at_xpath('KAF')
|
228
222
|
end
|
229
223
|
|
230
224
|
##
|
@@ -7,6 +7,8 @@ module Opener
|
|
7
7
|
|
8
8
|
include MonitorMixin
|
9
9
|
|
10
|
+
UPDATE_INTERVAL = (ENV['CACHE_EXPIRE_MINS']&.to_i || 5).minutes
|
11
|
+
|
10
12
|
def initialize
|
11
13
|
super
|
12
14
|
|
@@ -16,13 +18,9 @@ module Opener
|
|
16
18
|
|
17
19
|
def [] **params
|
18
20
|
synchronize do
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
else
|
24
|
-
@cache[params] = cache_update **params
|
25
|
-
end
|
21
|
+
existing = @cache[params]
|
22
|
+
break existing if existing and existing.from > UPDATE_INTERVAL.ago
|
23
|
+
@cache[params] = cache_update existing, **params
|
26
24
|
end
|
27
25
|
end
|
28
26
|
alias_method :get, :[]
|
@@ -31,7 +29,11 @@ module Opener
|
|
31
29
|
from = Time.now
|
32
30
|
lexicons = load_aspects cache: existing, **params
|
33
31
|
|
34
|
-
|
32
|
+
if existing and lexicons.blank?
|
33
|
+
existing.from = from
|
34
|
+
return existing
|
35
|
+
end
|
36
|
+
|
35
37
|
Hashie::Mash.new(
|
36
38
|
aspects: lexicons,
|
37
39
|
from: from,
|
@@ -39,8 +41,8 @@ module Opener
|
|
39
41
|
end
|
40
42
|
|
41
43
|
def load_aspects lang:, cache:, **params
|
42
|
-
url
|
43
|
-
url
|
44
|
+
url = "#{@url}&language_code=#{lang}&#{params.to_query}"
|
45
|
+
url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
|
44
46
|
puts "#{lang}: loading aspects from #{url}"
|
45
47
|
|
46
48
|
lexicons = JSON.parse HTTPClient.new.get(url).body
|
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
|
|
28
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
29
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
30
30
|
|
31
|
-
gem.add_dependency '
|
31
|
+
gem.add_dependency 'nokogiri'
|
32
32
|
gem.add_dependency 'httpclient'
|
33
33
|
gem.add_dependency 'hashie'
|
34
34
|
gem.add_dependency 'activesupport'
|
metadata
CHANGED
@@ -1,170 +1,164 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.
|
4
|
+
version: 3.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10
|
11
|
+
date: 2020-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
+
name: opener-daemons
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
15
16
|
requirements:
|
16
17
|
- - "~>"
|
17
18
|
- !ruby/object:Gem::Version
|
18
19
|
version: '2.2'
|
19
|
-
name: opener-daemons
|
20
|
-
prerelease: false
|
21
20
|
type: :runtime
|
21
|
+
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
+
name: opener-webservice
|
28
29
|
requirement: !ruby/object:Gem::Requirement
|
29
30
|
requirements:
|
30
31
|
- - "~>"
|
31
32
|
- !ruby/object:Gem::Version
|
32
33
|
version: '2.1'
|
33
|
-
name: opener-webservice
|
34
|
-
prerelease: false
|
35
34
|
type: :runtime
|
35
|
+
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '2.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
+
name: opener-core
|
42
43
|
requirement: !ruby/object:Gem::Requirement
|
43
44
|
requirements:
|
44
45
|
- - "~>"
|
45
46
|
- !ruby/object:Gem::Version
|
46
47
|
version: '2.2'
|
47
|
-
name: opener-core
|
48
|
-
prerelease: false
|
49
48
|
type: :runtime
|
49
|
+
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri
|
56
57
|
requirement: !ruby/object:Gem::Requirement
|
57
58
|
requirements:
|
58
|
-
- - "~>"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '1.0'
|
61
59
|
- - ">="
|
62
60
|
- !ruby/object:Gem::Version
|
63
|
-
version:
|
64
|
-
name: oga
|
65
|
-
prerelease: false
|
61
|
+
version: '0'
|
66
62
|
type: :runtime
|
63
|
+
prerelease: false
|
67
64
|
version_requirements: !ruby/object:Gem::Requirement
|
68
65
|
requirements:
|
69
|
-
- - "~>"
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
version: '1.0'
|
72
66
|
- - ">="
|
73
67
|
- !ruby/object:Gem::Version
|
74
|
-
version:
|
68
|
+
version: '0'
|
75
69
|
- !ruby/object:Gem::Dependency
|
70
|
+
name: httpclient
|
76
71
|
requirement: !ruby/object:Gem::Requirement
|
77
72
|
requirements:
|
78
73
|
- - ">="
|
79
74
|
- !ruby/object:Gem::Version
|
80
75
|
version: '0'
|
81
|
-
name: httpclient
|
82
|
-
prerelease: false
|
83
76
|
type: :runtime
|
77
|
+
prerelease: false
|
84
78
|
version_requirements: !ruby/object:Gem::Requirement
|
85
79
|
requirements:
|
86
80
|
- - ">="
|
87
81
|
- !ruby/object:Gem::Version
|
88
82
|
version: '0'
|
89
83
|
- !ruby/object:Gem::Dependency
|
84
|
+
name: hashie
|
90
85
|
requirement: !ruby/object:Gem::Requirement
|
91
86
|
requirements:
|
92
87
|
- - ">="
|
93
88
|
- !ruby/object:Gem::Version
|
94
89
|
version: '0'
|
95
|
-
name: hashie
|
96
|
-
prerelease: false
|
97
90
|
type: :runtime
|
91
|
+
prerelease: false
|
98
92
|
version_requirements: !ruby/object:Gem::Requirement
|
99
93
|
requirements:
|
100
94
|
- - ">="
|
101
95
|
- !ruby/object:Gem::Version
|
102
96
|
version: '0'
|
103
97
|
- !ruby/object:Gem::Dependency
|
98
|
+
name: activesupport
|
104
99
|
requirement: !ruby/object:Gem::Requirement
|
105
100
|
requirements:
|
106
101
|
- - ">="
|
107
102
|
- !ruby/object:Gem::Version
|
108
103
|
version: '0'
|
109
|
-
name: activesupport
|
110
|
-
prerelease: false
|
111
104
|
type: :runtime
|
105
|
+
prerelease: false
|
112
106
|
version_requirements: !ruby/object:Gem::Requirement
|
113
107
|
requirements:
|
114
108
|
- - ">="
|
115
109
|
- !ruby/object:Gem::Version
|
116
110
|
version: '0'
|
117
111
|
- !ruby/object:Gem::Dependency
|
112
|
+
name: rspec
|
118
113
|
requirement: !ruby/object:Gem::Requirement
|
119
114
|
requirements:
|
120
115
|
- - "~>"
|
121
116
|
- !ruby/object:Gem::Version
|
122
117
|
version: '3.0'
|
123
|
-
name: rspec
|
124
|
-
prerelease: false
|
125
118
|
type: :development
|
119
|
+
prerelease: false
|
126
120
|
version_requirements: !ruby/object:Gem::Requirement
|
127
121
|
requirements:
|
128
122
|
- - "~>"
|
129
123
|
- !ruby/object:Gem::Version
|
130
124
|
version: '3.0'
|
131
125
|
- !ruby/object:Gem::Dependency
|
126
|
+
name: cucumber
|
132
127
|
requirement: !ruby/object:Gem::Requirement
|
133
128
|
requirements:
|
134
129
|
- - ">="
|
135
130
|
- !ruby/object:Gem::Version
|
136
131
|
version: '0'
|
137
|
-
name: cucumber
|
138
|
-
prerelease: false
|
139
132
|
type: :development
|
133
|
+
prerelease: false
|
140
134
|
version_requirements: !ruby/object:Gem::Requirement
|
141
135
|
requirements:
|
142
136
|
- - ">="
|
143
137
|
- !ruby/object:Gem::Version
|
144
138
|
version: '0'
|
145
139
|
- !ruby/object:Gem::Dependency
|
140
|
+
name: rake
|
146
141
|
requirement: !ruby/object:Gem::Requirement
|
147
142
|
requirements:
|
148
143
|
- - ">="
|
149
144
|
- !ruby/object:Gem::Version
|
150
145
|
version: '0'
|
151
|
-
name: rake
|
152
|
-
prerelease: false
|
153
146
|
type: :development
|
147
|
+
prerelease: false
|
154
148
|
version_requirements: !ruby/object:Gem::Requirement
|
155
149
|
requirements:
|
156
150
|
- - ">="
|
157
151
|
- !ruby/object:Gem::Version
|
158
152
|
version: '0'
|
159
153
|
- !ruby/object:Gem::Dependency
|
154
|
+
name: benchmark-ips
|
160
155
|
requirement: !ruby/object:Gem::Requirement
|
161
156
|
requirements:
|
162
157
|
- - "~>"
|
163
158
|
- !ruby/object:Gem::Version
|
164
159
|
version: '2.0'
|
165
|
-
name: benchmark-ips
|
166
|
-
prerelease: false
|
167
160
|
type: :development
|
161
|
+
prerelease: false
|
168
162
|
version_requirements: !ruby/object:Gem::Requirement
|
169
163
|
requirements:
|
170
164
|
- - "~>"
|
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
219
213
|
version: '0'
|
220
214
|
requirements: []
|
221
215
|
rubyforge_project:
|
222
|
-
rubygems_version: 2.7.
|
216
|
+
rubygems_version: 2.7.8
|
223
217
|
signing_key:
|
224
218
|
specification_version: 4
|
225
219
|
summary: Property tagger for hotels in Dutch and English.
|