opener-property-tagger 3.3.1 → 3.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +1 -1
- data/lib/opener/property_tagger/file_aspects_cache.rb +1 -1
- data/lib/opener/property_tagger/processor.rb +45 -51
- data/lib/opener/property_tagger/remote_aspects_cache.rb +12 -10
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -1
- metadata +27 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
|
4
|
+
data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
|
7
|
+
data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
|
@@ -25,16 +25,17 @@ module Opener
|
|
25
25
|
# by default due to the performance overhead.
|
26
26
|
#
|
27
27
|
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
|
28
|
-
@document =
|
28
|
+
@document = Nokogiri.XML file
|
29
29
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
30
30
|
@timestamp = timestamp
|
31
31
|
@pretty = pretty
|
32
32
|
|
33
33
|
@params = params
|
34
|
-
@cache_keys = params[:cache_keys] || {lang: language}
|
35
34
|
@remote = !url.nil?
|
36
35
|
@aspects_path = path
|
37
36
|
@aspects_url = url
|
37
|
+
@cache_keys = params[:cache_keys]
|
38
|
+
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
38
39
|
|
39
40
|
@aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
40
41
|
end
|
@@ -57,70 +58,64 @@ module Opener
|
|
57
58
|
|
58
59
|
add_linguistic_processor
|
59
60
|
|
60
|
-
|
61
|
+
pretty ? pretty_print(document) : document.to_xml
|
61
62
|
end
|
62
63
|
|
63
|
-
##
|
64
|
-
# Get the language of the input file.
|
65
|
-
#
|
66
|
-
# @return [String]
|
67
|
-
#
|
68
64
|
def language
|
69
|
-
|
65
|
+
@language ||= document.at_xpath('KAF').attr('xml:lang')
|
70
66
|
end
|
71
67
|
|
72
|
-
##
|
73
|
-
# Get the terms from the input file
|
74
|
-
# @return [Hash]
|
75
|
-
#
|
76
68
|
def terms
|
77
69
|
unless @terms
|
78
70
|
@terms = {}
|
79
71
|
|
80
72
|
document.xpath('KAF/terms/term').each do |term|
|
81
|
-
@terms[term.
|
73
|
+
@terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
|
82
74
|
end
|
83
75
|
end
|
84
76
|
|
85
|
-
|
77
|
+
@terms
|
86
78
|
end
|
87
79
|
|
88
80
|
##
|
89
81
|
# Check which terms belong to an aspect (property)
|
82
|
+
# Text have priority over Lemmas, overriding if there is a conflict
|
90
83
|
# @return [Hash]
|
91
84
|
#
|
92
85
|
def extract_aspects
|
93
|
-
term_ids
|
94
|
-
lemmas
|
86
|
+
term_ids = terms.keys
|
87
|
+
lemmas = terms.values
|
88
|
+
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
95
89
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
90
|
+
[:lemma, :text].each do |k|
|
91
|
+
current_token = 0
|
92
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
93
|
+
# lemmas) belong to a property.
|
94
|
+
max_ngram = 2
|
100
95
|
|
101
|
-
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
102
96
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
97
|
+
while current_token < terms.count
|
98
|
+
(0..max_ngram).each do |tam_ngram|
|
99
|
+
if current_token + tam_ngram <= terms.count
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
|
107
101
|
|
108
|
-
|
109
|
-
|
110
|
-
|
102
|
+
if aspects[ngram.to_sym]
|
103
|
+
properties = aspects[ngram.to_sym]
|
104
|
+
ids = term_ids[current_token..current_token+tam_ngram]
|
111
105
|
|
112
|
-
|
113
|
-
|
106
|
+
properties.uniq.each do |property|
|
107
|
+
next if !property or property.strip.empty?
|
114
108
|
|
115
|
-
|
109
|
+
uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
|
110
|
+
end
|
116
111
|
end
|
117
112
|
end
|
118
113
|
end
|
114
|
+
current_token += 1
|
119
115
|
end
|
120
|
-
current_token += 1
|
121
116
|
end
|
122
117
|
|
123
|
-
|
118
|
+
Hash[uniq_aspects.sort]
|
124
119
|
end
|
125
120
|
|
126
121
|
##
|
@@ -143,22 +138,21 @@ module Opener
|
|
143
138
|
def add_property(key, value, index)
|
144
139
|
property_node = new_node("property", "KAF/features/properties")
|
145
140
|
|
146
|
-
property_node
|
147
|
-
property_node
|
141
|
+
property_node['lemma'] = key.to_s
|
142
|
+
property_node['pid'] = "p#{index.to_s}"
|
148
143
|
|
149
144
|
references_node = new_node("references", property_node)
|
150
145
|
|
151
146
|
value.uniq.each do |v|
|
152
|
-
|
153
|
-
|
154
|
-
references_node.children << comment
|
147
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
|
148
|
+
references_node.add_child comm_node
|
155
149
|
|
156
150
|
span_node = new_node("span", references_node)
|
157
151
|
|
158
152
|
v.first.each do |val|
|
159
|
-
target_node
|
153
|
+
target_node = new_node("target", span_node)
|
160
154
|
|
161
|
-
target_node
|
155
|
+
target_node['id'] = val.to_s
|
162
156
|
end
|
163
157
|
end
|
164
158
|
end
|
@@ -169,19 +163,19 @@ module Opener
|
|
169
163
|
version = '2.0'
|
170
164
|
|
171
165
|
node = new_node('linguisticProcessors', 'KAF/kafHeader')
|
172
|
-
node
|
166
|
+
node['layer'] = 'features'
|
173
167
|
|
174
168
|
lp_node = new_node('lp', node)
|
175
169
|
|
176
|
-
lp_node
|
177
|
-
lp_node
|
170
|
+
lp_node['version'] = "#{last_edited}-#{version}"
|
171
|
+
lp_node['name'] = description
|
178
172
|
|
179
173
|
if timestamp
|
180
174
|
format = '%Y-%m-%dT%H:%M:%S%Z'
|
181
175
|
|
182
|
-
lp_node
|
176
|
+
lp_node['timestamp'] = Time.now.strftime(format)
|
183
177
|
else
|
184
|
-
lp_node
|
178
|
+
lp_node['timestamp'] = '*'
|
185
179
|
end
|
186
180
|
end
|
187
181
|
|
@@ -200,7 +194,7 @@ module Opener
|
|
200
194
|
formatter.compact = true
|
201
195
|
formatter.write(doc, out)
|
202
196
|
|
203
|
-
|
197
|
+
out.strip
|
204
198
|
end
|
205
199
|
|
206
200
|
protected
|
@@ -212,11 +206,11 @@ module Opener
|
|
212
206
|
parent_node = parent
|
213
207
|
end
|
214
208
|
|
215
|
-
node =
|
209
|
+
node = Nokogiri::XML::Element.new(tag, document)
|
216
210
|
|
217
|
-
parent_node.
|
211
|
+
parent_node.add_child node
|
218
212
|
|
219
|
-
|
213
|
+
node
|
220
214
|
end
|
221
215
|
|
222
216
|
##
|
@@ -224,7 +218,7 @@ module Opener
|
|
224
218
|
# @return [Boolean]
|
225
219
|
#
|
226
220
|
def is_kaf?
|
227
|
-
|
221
|
+
!!document.at_xpath('KAF')
|
228
222
|
end
|
229
223
|
|
230
224
|
##
|
@@ -7,6 +7,8 @@ module Opener
|
|
7
7
|
|
8
8
|
include MonitorMixin
|
9
9
|
|
10
|
+
UPDATE_INTERVAL = (ENV['CACHE_EXPIRE_MINS']&.to_i || 5).minutes
|
11
|
+
|
10
12
|
def initialize
|
11
13
|
super
|
12
14
|
|
@@ -16,13 +18,9 @@ module Opener
|
|
16
18
|
|
17
19
|
def [] **params
|
18
20
|
synchronize do
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
else
|
24
|
-
@cache[params] = cache_update **params
|
25
|
-
end
|
21
|
+
existing = @cache[params]
|
22
|
+
break existing if existing and existing.from > UPDATE_INTERVAL.ago
|
23
|
+
@cache[params] = cache_update existing, **params
|
26
24
|
end
|
27
25
|
end
|
28
26
|
alias_method :get, :[]
|
@@ -31,7 +29,11 @@ module Opener
|
|
31
29
|
from = Time.now
|
32
30
|
lexicons = load_aspects cache: existing, **params
|
33
31
|
|
34
|
-
|
32
|
+
if existing and lexicons.blank?
|
33
|
+
existing.from = from
|
34
|
+
return existing
|
35
|
+
end
|
36
|
+
|
35
37
|
Hashie::Mash.new(
|
36
38
|
aspects: lexicons,
|
37
39
|
from: from,
|
@@ -39,8 +41,8 @@ module Opener
|
|
39
41
|
end
|
40
42
|
|
41
43
|
def load_aspects lang:, cache:, **params
|
42
|
-
url
|
43
|
-
url
|
44
|
+
url = "#{@url}&language_code=#{lang}&#{params.to_query}"
|
45
|
+
url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
|
44
46
|
puts "#{lang}: loading aspects from #{url}"
|
45
47
|
|
46
48
|
lexicons = JSON.parse HTTPClient.new.get(url).body
|
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
|
|
28
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
29
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
30
30
|
|
31
|
-
gem.add_dependency '
|
31
|
+
gem.add_dependency 'nokogiri'
|
32
32
|
gem.add_dependency 'httpclient'
|
33
33
|
gem.add_dependency 'hashie'
|
34
34
|
gem.add_dependency 'activesupport'
|
metadata
CHANGED
@@ -1,170 +1,164 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.
|
4
|
+
version: 3.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10
|
11
|
+
date: 2020-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
+
name: opener-daemons
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
15
16
|
requirements:
|
16
17
|
- - "~>"
|
17
18
|
- !ruby/object:Gem::Version
|
18
19
|
version: '2.2'
|
19
|
-
name: opener-daemons
|
20
|
-
prerelease: false
|
21
20
|
type: :runtime
|
21
|
+
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
+
name: opener-webservice
|
28
29
|
requirement: !ruby/object:Gem::Requirement
|
29
30
|
requirements:
|
30
31
|
- - "~>"
|
31
32
|
- !ruby/object:Gem::Version
|
32
33
|
version: '2.1'
|
33
|
-
name: opener-webservice
|
34
|
-
prerelease: false
|
35
34
|
type: :runtime
|
35
|
+
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '2.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
+
name: opener-core
|
42
43
|
requirement: !ruby/object:Gem::Requirement
|
43
44
|
requirements:
|
44
45
|
- - "~>"
|
45
46
|
- !ruby/object:Gem::Version
|
46
47
|
version: '2.2'
|
47
|
-
name: opener-core
|
48
|
-
prerelease: false
|
49
48
|
type: :runtime
|
49
|
+
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri
|
56
57
|
requirement: !ruby/object:Gem::Requirement
|
57
58
|
requirements:
|
58
|
-
- - "~>"
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version: '1.0'
|
61
59
|
- - ">="
|
62
60
|
- !ruby/object:Gem::Version
|
63
|
-
version:
|
64
|
-
name: oga
|
65
|
-
prerelease: false
|
61
|
+
version: '0'
|
66
62
|
type: :runtime
|
63
|
+
prerelease: false
|
67
64
|
version_requirements: !ruby/object:Gem::Requirement
|
68
65
|
requirements:
|
69
|
-
- - "~>"
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
version: '1.0'
|
72
66
|
- - ">="
|
73
67
|
- !ruby/object:Gem::Version
|
74
|
-
version:
|
68
|
+
version: '0'
|
75
69
|
- !ruby/object:Gem::Dependency
|
70
|
+
name: httpclient
|
76
71
|
requirement: !ruby/object:Gem::Requirement
|
77
72
|
requirements:
|
78
73
|
- - ">="
|
79
74
|
- !ruby/object:Gem::Version
|
80
75
|
version: '0'
|
81
|
-
name: httpclient
|
82
|
-
prerelease: false
|
83
76
|
type: :runtime
|
77
|
+
prerelease: false
|
84
78
|
version_requirements: !ruby/object:Gem::Requirement
|
85
79
|
requirements:
|
86
80
|
- - ">="
|
87
81
|
- !ruby/object:Gem::Version
|
88
82
|
version: '0'
|
89
83
|
- !ruby/object:Gem::Dependency
|
84
|
+
name: hashie
|
90
85
|
requirement: !ruby/object:Gem::Requirement
|
91
86
|
requirements:
|
92
87
|
- - ">="
|
93
88
|
- !ruby/object:Gem::Version
|
94
89
|
version: '0'
|
95
|
-
name: hashie
|
96
|
-
prerelease: false
|
97
90
|
type: :runtime
|
91
|
+
prerelease: false
|
98
92
|
version_requirements: !ruby/object:Gem::Requirement
|
99
93
|
requirements:
|
100
94
|
- - ">="
|
101
95
|
- !ruby/object:Gem::Version
|
102
96
|
version: '0'
|
103
97
|
- !ruby/object:Gem::Dependency
|
98
|
+
name: activesupport
|
104
99
|
requirement: !ruby/object:Gem::Requirement
|
105
100
|
requirements:
|
106
101
|
- - ">="
|
107
102
|
- !ruby/object:Gem::Version
|
108
103
|
version: '0'
|
109
|
-
name: activesupport
|
110
|
-
prerelease: false
|
111
104
|
type: :runtime
|
105
|
+
prerelease: false
|
112
106
|
version_requirements: !ruby/object:Gem::Requirement
|
113
107
|
requirements:
|
114
108
|
- - ">="
|
115
109
|
- !ruby/object:Gem::Version
|
116
110
|
version: '0'
|
117
111
|
- !ruby/object:Gem::Dependency
|
112
|
+
name: rspec
|
118
113
|
requirement: !ruby/object:Gem::Requirement
|
119
114
|
requirements:
|
120
115
|
- - "~>"
|
121
116
|
- !ruby/object:Gem::Version
|
122
117
|
version: '3.0'
|
123
|
-
name: rspec
|
124
|
-
prerelease: false
|
125
118
|
type: :development
|
119
|
+
prerelease: false
|
126
120
|
version_requirements: !ruby/object:Gem::Requirement
|
127
121
|
requirements:
|
128
122
|
- - "~>"
|
129
123
|
- !ruby/object:Gem::Version
|
130
124
|
version: '3.0'
|
131
125
|
- !ruby/object:Gem::Dependency
|
126
|
+
name: cucumber
|
132
127
|
requirement: !ruby/object:Gem::Requirement
|
133
128
|
requirements:
|
134
129
|
- - ">="
|
135
130
|
- !ruby/object:Gem::Version
|
136
131
|
version: '0'
|
137
|
-
name: cucumber
|
138
|
-
prerelease: false
|
139
132
|
type: :development
|
133
|
+
prerelease: false
|
140
134
|
version_requirements: !ruby/object:Gem::Requirement
|
141
135
|
requirements:
|
142
136
|
- - ">="
|
143
137
|
- !ruby/object:Gem::Version
|
144
138
|
version: '0'
|
145
139
|
- !ruby/object:Gem::Dependency
|
140
|
+
name: rake
|
146
141
|
requirement: !ruby/object:Gem::Requirement
|
147
142
|
requirements:
|
148
143
|
- - ">="
|
149
144
|
- !ruby/object:Gem::Version
|
150
145
|
version: '0'
|
151
|
-
name: rake
|
152
|
-
prerelease: false
|
153
146
|
type: :development
|
147
|
+
prerelease: false
|
154
148
|
version_requirements: !ruby/object:Gem::Requirement
|
155
149
|
requirements:
|
156
150
|
- - ">="
|
157
151
|
- !ruby/object:Gem::Version
|
158
152
|
version: '0'
|
159
153
|
- !ruby/object:Gem::Dependency
|
154
|
+
name: benchmark-ips
|
160
155
|
requirement: !ruby/object:Gem::Requirement
|
161
156
|
requirements:
|
162
157
|
- - "~>"
|
163
158
|
- !ruby/object:Gem::Version
|
164
159
|
version: '2.0'
|
165
|
-
name: benchmark-ips
|
166
|
-
prerelease: false
|
167
160
|
type: :development
|
161
|
+
prerelease: false
|
168
162
|
version_requirements: !ruby/object:Gem::Requirement
|
169
163
|
requirements:
|
170
164
|
- - "~>"
|
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
219
213
|
version: '0'
|
220
214
|
requirements: []
|
221
215
|
rubyforge_project:
|
222
|
-
rubygems_version: 2.7.
|
216
|
+
rubygems_version: 2.7.8
|
223
217
|
signing_key:
|
224
218
|
specification_version: 4
|
225
219
|
summary: Property tagger for hotels in Dutch and English.
|