opener-property-tagger 3.3.3 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +1 -1
- data/lib/opener/property_tagger/file_aspects_cache.rb +8 -3
- data/lib/opener/property_tagger/processor.rb +65 -67
- data/lib/opener/property_tagger/remote_aspects_cache.rb +8 -4
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -1
- metadata +6 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd341e6aded0a9e7690a95a6d56ad68fd530df0b55458a54e3831b0b3a743393
+  data.tar.gz: 1ecc43a166b60c8ee56dcd5345efbbb792acfac285a4c2b1d1448c0b5aa91c79
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: efb8fb35c77c6886d01929d04e29d590e25ff0ecb70e98110c85c221d4e47557bca951793ea278a994f48660e1016c4c1df08e13c552f7b601b8fb65734f3293
+  data.tar.gz: 6f51528b4be7f5eb5985720914ac555066f7105832e8459db68eb729dfa4d45b8023751f491ce06776ca9af784f27951e6225246c1305324ce2c539452940110
data/lib/opener/property_tagger/file_aspects_cache.rb
CHANGED
@@ -33,12 +33,17 @@ module Opener
     # @param [String] path
     #
     def load_aspects(path)
-      mapping = Hash.new
+      mapping = Hash.new{ |hash, key| hash[key] = [] }
 
-      File.foreach
+      File.foreach path do |line|
         lemma, pos, aspect = line.chomp.split("\t")
+        l = Hashie::Mash.new(
+          lemma: lemma,
+          pos: pos,
+          aspect: aspect,
+        )
 
-        mapping[lemma.to_sym] <<
+        mapping[l.lemma.to_sym] << l
       end
 
       return mapping
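The rewritten load_aspects groups every lexicon row under its lemma symbol, and the default-block hash means looking up an unknown lemma simply yields an empty array. A minimal sketch of the resulting structure, assuming the tab-separated lexicon layout read above and the hashie gem (the sample lines are illustrative):

require 'hashie'

# Two illustrative lexicon lines in the "<lemma>\t<pos>\t<aspect>" layout.
lines = ["room\tN\tfacilities", "breakfast\tN\tfood"]

mapping = Hash.new { |hash, key| hash[key] = [] }

lines.each do |line|
  lemma, pos, aspect = line.chomp.split("\t")
  mapping[lemma.to_sym] << Hashie::Mash.new(lemma: lemma, pos: pos, aspect: aspect)
end

mapping[:room].first.aspect # => "facilities"
mapping[:missing]           # => [] (default block)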
data/lib/opener/property_tagger/processor.rb
CHANGED
@@ -6,7 +6,8 @@ module Opener
   class Processor
 
     attr_accessor :document
-    attr_accessor :
+    attr_accessor :aspects_path, :aspects_url
+    attr_accessor :aspects, :lexicons
     attr_accessor :timestamp, :pretty
 
     ##
@@ -25,18 +26,19 @@ module Opener
     # by default due to the performance overhead.
     #
     def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
-      @document =
+      @document = Nokogiri.XML file
       raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
       @timestamp = timestamp
       @pretty = pretty
 
       @params = params
-      @cache_keys = params[:cache_keys] || {lang: language}
       @remote = !url.nil?
       @aspects_path = path
       @aspects_url = url
+      @cache_keys = params[:cache_keys] || {}
+      @cache_keys.merge! lang: @document.root.attr('xml:lang')
 
-      @
+      @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
     end
 
     ##
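The constructor now derives the cache key's language from the parsed document itself instead of a caller-supplied value. A minimal sketch of that derivation with Nokogiri, using an illustrative KAF fragment:

require 'nokogiri'

# Illustrative KAF fragment; a real input also carries terms, word forms, etc.
document = Nokogiri.XML('<KAF xml:lang="en" version="v1"/>')

cache_keys = {}                                         # params[:cache_keys] || {}
cache_keys.merge!(lang: document.root.attr('xml:lang'))

cache_keys # => {:lang=>"en"}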
@@ -44,83 +46,79 @@ module Opener
     # @return [String]
     #
     def process
-      existing_aspects = extract_aspects
-
       add_features_layer
       add_properties_layer
 
-
+      extract_aspects.each.with_index do |(lemma, values), index|
         index += 1
 
-        add_property
+        add_property lemma, values, index
       end
 
       add_linguistic_processor
 
-
+      pretty ? pretty_print(document) : document.to_xml
     end
 
-    ##
-    # Get the language of the input file.
-    #
-    # @return [String]
-    #
     def language
-
+      @language ||= document.at_xpath('KAF').attr('xml:lang')
     end
 
-    ##
-    # Get the terms from the input file
-    # @return [Hash]
-    #
     def terms
       unless @terms
         @terms = {}
 
         document.xpath('KAF/terms/term').each do |term|
-          @terms[term.
+          @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
         end
       end
 
-
+      @terms
     end
 
+    # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
+    # lemmas) belong to a property.
+    MAX_NGRAM = 2
+
     ##
     # Check which terms belong to an aspect (property)
+    # Text have priority over Lemmas, overriding if there is a conflict
     # @return [Hash]
     #
     def extract_aspects
-
-      lemmas
-
-
-
-
-
-
-
-
-
-
-      ngram
-
-
-
-
-
-
-
-
-
+      all_term_ids = terms.keys
+      lemmas = terms.values
+      uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
+
+      [:lemma, :text].each do |k|
+        current_token = 0
+
+        while current_token < terms.count
+          (0..MAX_NGRAM).each do |tam_ngram|
+            next unless current_token + tam_ngram <= terms.count
+
+            ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
+
+            @lexicons[ngram.to_sym]&.each do |l|
+              properties = if l.aspects.present? then l.aspects else [l.aspect] end
+              properties.each do |p|
+                next if p.blank?
+                term_ids = all_term_ids[current_token..current_token+tam_ngram]
+                next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
+
+                uniq_aspects[p.to_sym] << Hashie::Mash.new(
+                  term_ids: term_ids,
+                  ngram: ngram,
+                  lexicon: l,
+                )
              end
            end
          end
+          current_token += 1
        end
-        current_token += 1
      end
 
-
+      Hash[uniq_aspects.sort]
     end
 
     ##
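The new extract_aspects walks the term list twice (once by lemma, once by surface text) and looks up unigrams and bigrams in the lexicon hash. A simplified sketch of that n-gram lookup, not the gem's exact loop, using a toy lexicon and toy terms:

require 'hashie'

# Toy lexicon keyed by downcased n-gram; the data is illustrative.
lexicons = Hash.new { |hash, key| hash[key] = [] }
lexicons[:'swimming pool'] << Hashie::Mash.new(aspect: 'facilities')
lexicons[:breakfast]       << Hashie::Mash.new(aspect: 'food')

terms = [
  { lemma: 'breakfast', text: 'Breakfast' },
  { lemma: 'swimming',  text: 'swimming' },
  { lemma: 'pool',      text: 'pool' },
]

max_ngram = 2
matches   = []

terms.each_index do |current_token|
  (0...max_ngram).each do |tam_ngram|
    next unless current_token + tam_ngram < terms.count

    ngram = terms[current_token..current_token + tam_ngram].map { |t| t[:lemma] }.join(' ').downcase
    lexicons[ngram.to_sym].each { |l| matches << [ngram, l.aspect] }
  end
end

matches # => [["breakfast", "food"], ["swimming pool", "facilities"]]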
@@ -140,25 +138,25 @@ module Opener
       new_node("properties", "KAF/features")
     end
 
-    def add_property
+    def add_property lemma, values, index
       property_node = new_node("property", "KAF/features/properties")
 
-      property_node
-      property_node
+      property_node['lemma'] = lemma.to_s
+      property_node['pid'] = "p#{index.to_s}"
 
       references_node = new_node("references", property_node)
 
-
-
-
-      references_node.children << comment
+      values.each do |v|
+        comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
+        references_node.add_child comm_node
 
-        span_node = new_node
+        span_node = new_node 'span', references_node
 
-        v.
-        target_node
+        v.term_ids.each do |id|
+          target_node = new_node 'target', span_node
 
-          target_node
+          target_node['id'] = id.to_s
+          target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
         end
       end
     end
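The reworked add_property writes one property element per aspect, keeps the matched n-gram as an XML comment, and emits one target per term id. A rough Nokogiri sketch of the element shape it builds through the new_node helper (the aspect name, pid, and term ids below are illustrative, not gem output):

require 'nokogiri'

document   = Nokogiri.XML('<KAF xml:lang="en"><features><properties/></features></KAF>')
properties = document.at_xpath('KAF/features/properties')

property = Nokogiri::XML::Element.new('property', document)
property['lemma'] = 'facilities'
property['pid']   = 'p1'
properties.add_child property

references = Nokogiri::XML::Element.new('references', document)
property.add_child references
references.add_child Nokogiri::XML::Comment.new(document, ' swimming pool ')

span = Nokogiri::XML::Element.new('span', document)
references.add_child span

%w[t2 t3].each do |id|
  target = Nokogiri::XML::Element.new('target', document)
  target['id'] = id
  span.add_child target
end

puts document.to_xml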
@@ -169,19 +167,19 @@ module Opener
       version = '2.0'
 
       node = new_node('linguisticProcessors', 'KAF/kafHeader')
-      node
+      node['layer'] = 'features'
 
       lp_node = new_node('lp', node)
 
-      lp_node
-      lp_node
+      lp_node['version'] = "#{last_edited}-#{version}"
+      lp_node['name'] = description
 
       if timestamp
         format = '%Y-%m-%dT%H:%M:%S%Z'
 
-        lp_node
+        lp_node['timestamp'] = Time.now.strftime(format)
       else
-        lp_node
+        lp_node['timestamp'] = '*'
       end
     end
 
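For reference, the '%Y-%m-%dT%H:%M:%S%Z' format used for the lp timestamp renders like this (an example UTC time, not gem output):

format = '%Y-%m-%dT%H:%M:%S%Z'
Time.utc(2021, 2, 26, 12, 0, 0).strftime(format) # => "2021-02-26T12:00:00UTC"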
@@ -200,7 +198,7 @@ module Opener
       formatter.compact = true
       formatter.write(doc, out)
 
-
+      out.strip
     end
 
     protected
@@ -212,11 +210,11 @@ module Opener
        parent_node = parent
      end
 
-      node =
+      node = Nokogiri::XML::Element.new(tag, document)
 
-      parent_node.
+      parent_node.add_child node
 
-
+      node
     end
 
     ##
@@ -224,7 +222,7 @@ module Opener
     # @return [Boolean]
     #
     def is_kaf?
-
+      !!document.at_xpath('KAF')
     end
 
     ##
data/lib/opener/property_tagger/remote_aspects_cache.rb
CHANGED
@@ -17,10 +17,11 @@ module Opener
     end
 
     def [] **params
+      existing = @cache[params]
+      return existing if existing and existing.from > UPDATE_INTERVAL.ago
+
       synchronize do
-
-        break existing if existing and existing.from > UPDATE_INTERVAL.ago
-        @cache[params] = cache_update existing, **params
+        @cache[params] = cache_update @cache[params], **params
       end
     end
     alias_method :get, :[]
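The lookup now returns a still-fresh cache entry without taking the lock and only synchronizes when an update is needed. A minimal sketch of that pattern, assuming a Monitor-backed synchronize and a plain Time comparison in place of ActiveSupport's UPDATE_INTERVAL.ago (the class name, entry shape, and fetch method are stand-ins):

require 'monitor'

class TinyCache
  include MonitorMixin

  UPDATE_INTERVAL = 300 # seconds; illustrative value

  def initialize
    super()
    @cache = {}
  end

  def [](**params)
    existing = @cache[params]
    return existing if existing && existing[:from] > Time.now - UPDATE_INTERVAL

    synchronize do
      @cache[params] = { from: Time.now, aspects: fetch_aspects(**params) }
    end
  end

  private

  # Stand-in for the remote fetch performed by the real cache.
  def fetch_aspects(**params)
    { lang: params[:lang], data: [] }
  end
end

cache = TinyCache.new
cache[lang: 'en'] # populates under the lock; repeat calls within the interval return without locking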
@@ -49,7 +50,10 @@ module Opener
       lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
       mapping = Hash.new{ |hash, key| hash[key] = [] }
       lexicons.each do |l|
-        mapping[l.lemma.to_sym] << l
+        mapping[l.lemma.to_sym] << l
+        l.variants&.each do |v|
+          mapping[v.lemma.to_sym] << l
+        end
       end
 
       mapping
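Variants now share the lexicon entry of their parent lemma, so either spelling resolves to the same aspect. A short illustration, assuming a remote payload shaped like the data key above (field names beyond lemma and variants are illustrative):

require 'hashie'

payload = {
  'data' => [
    { 'lemma' => 'bathroom', 'aspect' => 'facilities',
      'variants' => [{ 'lemma' => 'bath room' }, { 'lemma' => 'washroom' }] },
  ],
}

lexicons = payload['data'].map { |l| Hashie::Mash.new l }
mapping  = Hash.new { |hash, key| hash[key] = [] }

lexicons.each do |l|
  mapping[l.lemma.to_sym] << l
  l.variants&.each do |v|
    mapping[v.lemma.to_sym] << l # the variant points at the same lexicon entry
  end
end

mapping[:washroom].first.aspect # => "facilities"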
data/opener-property-tagger.gemspec
CHANGED
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
   gem.add_dependency 'opener-webservice', '~> 2.1'
   gem.add_dependency 'opener-core', '~> 2.2'
 
-  gem.add_dependency '
+  gem.add_dependency 'nokogiri'
   gem.add_dependency 'httpclient'
   gem.add_dependency 'hashie'
   gem.add_dependency 'activesupport'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: opener-property-tagger
 version: !ruby/object:Gem::Version
-  version: 3.
+  version: 3.4.1
 platform: ruby
 authors:
 - development@olery.com
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: opener-daemons
@@ -53,25 +53,19 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '2.2'
 - !ruby/object:Gem::Dependency
-  name:
+  name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: httpclient
   requirement: !ruby/object:Gem::Requirement
@@ -173,9 +167,9 @@ dependencies:
 description: Property tagger for hotels in Dutch and English.
 email:
 executables:
-- property-tagger-server
 - property-tagger
 - property-tagger-daemon
+- property-tagger-server
 extensions: []
 extra_rdoc_files: []
 files: