opener-property-tagger 3.3.3 → 3.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +1 -1
- data/lib/opener/property_tagger/file_aspects_cache.rb +8 -3
- data/lib/opener/property_tagger/processor.rb +65 -67
- data/lib/opener/property_tagger/remote_aspects_cache.rb +8 -4
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -1
- metadata +6 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd341e6aded0a9e7690a95a6d56ad68fd530df0b55458a54e3831b0b3a743393
|
4
|
+
data.tar.gz: 1ecc43a166b60c8ee56dcd5345efbbb792acfac285a4c2b1d1448c0b5aa91c79
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: efb8fb35c77c6886d01929d04e29d590e25ff0ecb70e98110c85c221d4e47557bca951793ea278a994f48660e1016c4c1df08e13c552f7b601b8fb65734f3293
|
7
|
+
data.tar.gz: 6f51528b4be7f5eb5985720914ac555066f7105832e8459db68eb729dfa4d45b8023751f491ce06776ca9af784f27951e6225246c1305324ce2c539452940110
|
@@ -33,12 +33,17 @@ module Opener
|
|
33
33
|
# @param [String] path
|
34
34
|
#
|
35
35
|
def load_aspects(path)
|
36
|
-
mapping = Hash.new
|
36
|
+
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
37
37
|
|
38
|
-
File.foreach
|
38
|
+
File.foreach path do |line|
|
39
39
|
lemma, pos, aspect = line.chomp.split("\t")
|
40
|
+
l = Hashie::Mash.new(
|
41
|
+
lemma: lemma,
|
42
|
+
pos: pos,
|
43
|
+
aspect: aspect,
|
44
|
+
)
|
40
45
|
|
41
|
-
mapping[lemma.to_sym] <<
|
46
|
+
mapping[l.lemma.to_sym] << l
|
42
47
|
end
|
43
48
|
|
44
49
|
return mapping
|
@@ -6,7 +6,8 @@ module Opener
|
|
6
6
|
class Processor
|
7
7
|
|
8
8
|
attr_accessor :document
|
9
|
-
attr_accessor :
|
9
|
+
attr_accessor :aspects_path, :aspects_url
|
10
|
+
attr_accessor :aspects, :lexicons
|
10
11
|
attr_accessor :timestamp, :pretty
|
11
12
|
|
12
13
|
##
|
@@ -25,18 +26,19 @@ module Opener
|
|
25
26
|
# by default due to the performance overhead.
|
26
27
|
#
|
27
28
|
def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
|
28
|
-
@document =
|
29
|
+
@document = Nokogiri.XML file
|
29
30
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
30
31
|
@timestamp = timestamp
|
31
32
|
@pretty = pretty
|
32
33
|
|
33
34
|
@params = params
|
34
|
-
@cache_keys = params[:cache_keys] || {lang: language}
|
35
35
|
@remote = !url.nil?
|
36
36
|
@aspects_path = path
|
37
37
|
@aspects_url = url
|
38
|
+
@cache_keys = params[:cache_keys] || {}
|
39
|
+
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
38
40
|
|
39
|
-
@
|
41
|
+
@lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
40
42
|
end
|
41
43
|
|
42
44
|
##
|
@@ -44,83 +46,79 @@ module Opener
|
|
44
46
|
# @return [String]
|
45
47
|
#
|
46
48
|
def process
|
47
|
-
existing_aspects = extract_aspects
|
48
|
-
|
49
49
|
add_features_layer
|
50
50
|
add_properties_layer
|
51
51
|
|
52
|
-
|
52
|
+
extract_aspects.each.with_index do |(lemma, values), index|
|
53
53
|
index += 1
|
54
54
|
|
55
|
-
add_property
|
55
|
+
add_property lemma, values, index
|
56
56
|
end
|
57
57
|
|
58
58
|
add_linguistic_processor
|
59
59
|
|
60
|
-
|
60
|
+
pretty ? pretty_print(document) : document.to_xml
|
61
61
|
end
|
62
62
|
|
63
|
-
##
|
64
|
-
# Get the language of the input file.
|
65
|
-
#
|
66
|
-
# @return [String]
|
67
|
-
#
|
68
63
|
def language
|
69
|
-
|
64
|
+
@language ||= document.at_xpath('KAF').attr('xml:lang')
|
70
65
|
end
|
71
66
|
|
72
|
-
##
|
73
|
-
# Get the terms from the input file
|
74
|
-
# @return [Hash]
|
75
|
-
#
|
76
67
|
def terms
|
77
68
|
unless @terms
|
78
69
|
@terms = {}
|
79
70
|
|
80
71
|
document.xpath('KAF/terms/term').each do |term|
|
81
|
-
@terms[term.
|
72
|
+
@terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
|
82
73
|
end
|
83
74
|
end
|
84
75
|
|
85
|
-
|
76
|
+
@terms
|
86
77
|
end
|
87
78
|
|
79
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
80
|
+
# lemmas) belong to a property.
|
81
|
+
MAX_NGRAM = 2
|
82
|
+
|
88
83
|
##
|
89
84
|
# Check which terms belong to an aspect (property)
|
85
|
+
# Text have priority over Lemmas, overriding if there is a conflict
|
90
86
|
# @return [Hash]
|
91
87
|
#
|
92
88
|
def extract_aspects
|
93
|
-
|
94
|
-
lemmas
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
ngram
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
89
|
+
all_term_ids = terms.keys
|
90
|
+
lemmas = terms.values
|
91
|
+
uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
|
92
|
+
|
93
|
+
[:lemma, :text].each do |k|
|
94
|
+
current_token = 0
|
95
|
+
|
96
|
+
while current_token < terms.count
|
97
|
+
(0..MAX_NGRAM).each do |tam_ngram|
|
98
|
+
next unless current_token + tam_ngram <= terms.count
|
99
|
+
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
|
101
|
+
|
102
|
+
@lexicons[ngram.to_sym]&.each do |l|
|
103
|
+
properties = if l.aspects.present? then l.aspects else [l.aspect] end
|
104
|
+
properties.each do |p|
|
105
|
+
next if p.blank?
|
106
|
+
term_ids = all_term_ids[current_token..current_token+tam_ngram]
|
107
|
+
next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
|
108
|
+
|
109
|
+
uniq_aspects[p.to_sym] << Hashie::Mash.new(
|
110
|
+
term_ids: term_ids,
|
111
|
+
ngram: ngram,
|
112
|
+
lexicon: l,
|
113
|
+
)
|
116
114
|
end
|
117
115
|
end
|
118
116
|
end
|
117
|
+
current_token += 1
|
119
118
|
end
|
120
|
-
current_token += 1
|
121
119
|
end
|
122
120
|
|
123
|
-
|
121
|
+
Hash[uniq_aspects.sort]
|
124
122
|
end
|
125
123
|
|
126
124
|
##
|
@@ -140,25 +138,25 @@ module Opener
|
|
140
138
|
new_node("properties", "KAF/features")
|
141
139
|
end
|
142
140
|
|
143
|
-
def add_property
|
141
|
+
def add_property lemma, values, index
|
144
142
|
property_node = new_node("property", "KAF/features/properties")
|
145
143
|
|
146
|
-
property_node
|
147
|
-
property_node
|
144
|
+
property_node['lemma'] = lemma.to_s
|
145
|
+
property_node['pid'] = "p#{index.to_s}"
|
148
146
|
|
149
147
|
references_node = new_node("references", property_node)
|
150
148
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
references_node.children << comment
|
149
|
+
values.each do |v|
|
150
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
|
151
|
+
references_node.add_child comm_node
|
155
152
|
|
156
|
-
span_node = new_node
|
153
|
+
span_node = new_node 'span', references_node
|
157
154
|
|
158
|
-
v.
|
159
|
-
target_node
|
155
|
+
v.term_ids.each do |id|
|
156
|
+
target_node = new_node 'target', span_node
|
160
157
|
|
161
|
-
target_node
|
158
|
+
target_node['id'] = id.to_s
|
159
|
+
target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
|
162
160
|
end
|
163
161
|
end
|
164
162
|
end
|
@@ -169,19 +167,19 @@ module Opener
|
|
169
167
|
version = '2.0'
|
170
168
|
|
171
169
|
node = new_node('linguisticProcessors', 'KAF/kafHeader')
|
172
|
-
node
|
170
|
+
node['layer'] = 'features'
|
173
171
|
|
174
172
|
lp_node = new_node('lp', node)
|
175
173
|
|
176
|
-
lp_node
|
177
|
-
lp_node
|
174
|
+
lp_node['version'] = "#{last_edited}-#{version}"
|
175
|
+
lp_node['name'] = description
|
178
176
|
|
179
177
|
if timestamp
|
180
178
|
format = '%Y-%m-%dT%H:%M:%S%Z'
|
181
179
|
|
182
|
-
lp_node
|
180
|
+
lp_node['timestamp'] = Time.now.strftime(format)
|
183
181
|
else
|
184
|
-
lp_node
|
182
|
+
lp_node['timestamp'] = '*'
|
185
183
|
end
|
186
184
|
end
|
187
185
|
|
@@ -200,7 +198,7 @@ module Opener
|
|
200
198
|
formatter.compact = true
|
201
199
|
formatter.write(doc, out)
|
202
200
|
|
203
|
-
|
201
|
+
out.strip
|
204
202
|
end
|
205
203
|
|
206
204
|
protected
|
@@ -212,11 +210,11 @@ module Opener
|
|
212
210
|
parent_node = parent
|
213
211
|
end
|
214
212
|
|
215
|
-
node =
|
213
|
+
node = Nokogiri::XML::Element.new(tag, document)
|
216
214
|
|
217
|
-
parent_node.
|
215
|
+
parent_node.add_child node
|
218
216
|
|
219
|
-
|
217
|
+
node
|
220
218
|
end
|
221
219
|
|
222
220
|
##
|
@@ -224,7 +222,7 @@ module Opener
|
|
224
222
|
# @return [Boolean]
|
225
223
|
#
|
226
224
|
def is_kaf?
|
227
|
-
|
225
|
+
!!document.at_xpath('KAF')
|
228
226
|
end
|
229
227
|
|
230
228
|
##
|
@@ -17,10 +17,11 @@ module Opener
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def [] **params
|
20
|
+
existing = @cache[params]
|
21
|
+
return existing if existing and existing.from > UPDATE_INTERVAL.ago
|
22
|
+
|
20
23
|
synchronize do
|
21
|
-
|
22
|
-
break existing if existing and existing.from > UPDATE_INTERVAL.ago
|
23
|
-
@cache[params] = cache_update existing, **params
|
24
|
+
@cache[params] = cache_update @cache[params], **params
|
24
25
|
end
|
25
26
|
end
|
26
27
|
alias_method :get, :[]
|
@@ -49,7 +50,10 @@ module Opener
|
|
49
50
|
lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
|
50
51
|
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
51
52
|
lexicons.each do |l|
|
52
|
-
mapping[l.lemma.to_sym] << l
|
53
|
+
mapping[l.lemma.to_sym] << l
|
54
|
+
l.variants&.each do |v|
|
55
|
+
mapping[v.lemma.to_sym] << l
|
56
|
+
end
|
53
57
|
end
|
54
58
|
|
55
59
|
mapping
|
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
|
|
28
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
29
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
30
30
|
|
31
|
-
gem.add_dependency '
|
31
|
+
gem.add_dependency 'nokogiri'
|
32
32
|
gem.add_dependency 'httpclient'
|
33
33
|
gem.add_dependency 'hashie'
|
34
34
|
gem.add_dependency 'activesupport'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -53,25 +53,19 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '1.0'
|
62
59
|
- - ">="
|
63
60
|
- !ruby/object:Gem::Version
|
64
|
-
version:
|
61
|
+
version: '0'
|
65
62
|
type: :runtime
|
66
63
|
prerelease: false
|
67
64
|
version_requirements: !ruby/object:Gem::Requirement
|
68
65
|
requirements:
|
69
|
-
- - "~>"
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
version: '1.0'
|
72
66
|
- - ">="
|
73
67
|
- !ruby/object:Gem::Version
|
74
|
-
version:
|
68
|
+
version: '0'
|
75
69
|
- !ruby/object:Gem::Dependency
|
76
70
|
name: httpclient
|
77
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -173,9 +167,9 @@ dependencies:
|
|
173
167
|
description: Property tagger for hotels in Dutch and English.
|
174
168
|
email:
|
175
169
|
executables:
|
176
|
-
- property-tagger-server
|
177
170
|
- property-tagger
|
178
171
|
- property-tagger-daemon
|
172
|
+
- property-tagger-server
|
179
173
|
extensions: []
|
180
174
|
extra_rdoc_files: []
|
181
175
|
files:
|