opener-property-tagger 3.3.5 → 3.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 181a83398a21579d625df2d6dfbb5e8d03759e2e285c40f3e6fe519f73f270b7
|
4
|
+
data.tar.gz: f2548258178f538a8f7e32e0617fd5f8bea8399373399529fab6316906739543
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e3817f4e33c9aedc67d767e4e194a76a7b3d5f87331e7722916dd3f84c4475f51f04763a9dfc7561092eab623aeee2f90c2ab7b4b2a8caf2b99e34708ebe3995
|
7
|
+
data.tar.gz: bb42edeff3f5225972d475c4522da2b91b99484edb94d2a14b646ea7712964e0f22858819e7b0affb4b59ecfd87f4823035ab13d804a76c4b377684bfb7ea7c1
|
@@ -33,12 +33,17 @@ module Opener
|
|
33
33
|
# @param [String] path
|
34
34
|
#
|
35
35
|
def load_aspects(path)
|
36
|
-
mapping = Hash.new
|
36
|
+
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
37
37
|
|
38
|
-
File.foreach
|
39
|
-
lemma,
|
38
|
+
File.foreach path do |line|
|
39
|
+
lemma, pos, aspect = line.chomp.split("\t")
|
40
|
+
l = Hashie::Mash.new(
|
41
|
+
lemma: lemma,
|
42
|
+
pos: pos,
|
43
|
+
aspect: aspect,
|
44
|
+
)
|
40
45
|
|
41
|
-
mapping[lemma.to_sym] <<
|
46
|
+
mapping[l.lemma.to_sym] << l
|
42
47
|
end
|
43
48
|
|
44
49
|
return mapping
|
@@ -6,7 +6,8 @@ module Opener
|
|
6
6
|
class Processor
|
7
7
|
|
8
8
|
attr_accessor :document
|
9
|
-
attr_accessor :
|
9
|
+
attr_accessor :aspects_path, :aspects_url
|
10
|
+
attr_accessor :aspects, :lexicons
|
10
11
|
attr_accessor :timestamp, :pretty
|
11
12
|
|
12
13
|
##
|
@@ -31,12 +32,13 @@ module Opener
|
|
31
32
|
@pretty = pretty
|
32
33
|
|
33
34
|
@params = params
|
34
|
-
@cache_keys = params[:cache_keys] || {lang: language}
|
35
35
|
@remote = !url.nil?
|
36
36
|
@aspects_path = path
|
37
37
|
@aspects_url = url
|
38
|
+
@cache_keys = params[:cache_keys] || {}
|
39
|
+
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
38
40
|
|
39
|
-
@
|
41
|
+
@lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
40
42
|
end
|
41
43
|
|
42
44
|
##
|
@@ -44,15 +46,13 @@ module Opener
|
|
44
46
|
# @return [String]
|
45
47
|
#
|
46
48
|
def process
|
47
|
-
existing_aspects = extract_aspects
|
48
|
-
|
49
49
|
add_features_layer
|
50
50
|
add_properties_layer
|
51
51
|
|
52
|
-
|
52
|
+
extract_aspects.each.with_index do |(lemma, values), index|
|
53
53
|
index += 1
|
54
54
|
|
55
|
-
add_property
|
55
|
+
add_property lemma, values, index
|
56
56
|
end
|
57
57
|
|
58
58
|
add_linguistic_processor
|
@@ -76,36 +76,41 @@ module Opener
|
|
76
76
|
@terms
|
77
77
|
end
|
78
78
|
|
79
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
80
|
+
# lemmas) belong to a property.
|
81
|
+
MAX_NGRAM = 2
|
82
|
+
|
79
83
|
##
|
80
84
|
# Check which terms belong to an aspect (property)
|
85
|
+
# Text have priority over Lemmas, overriding if there is a conflict
|
81
86
|
# @return [Hash]
|
82
87
|
#
|
83
88
|
def extract_aspects
|
84
|
-
|
89
|
+
all_term_ids = terms.keys
|
85
90
|
lemmas = terms.values
|
86
|
-
uniq_aspects = Hash.new
|
91
|
+
uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
|
87
92
|
|
88
93
|
[:lemma, :text].each do |k|
|
89
94
|
current_token = 0
|
90
|
-
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
91
|
-
# lemmas) belong to a property.
|
92
|
-
max_ngram = 2
|
93
|
-
|
94
95
|
|
95
96
|
while current_token < terms.count
|
96
|
-
(0..
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
97
|
+
(0..MAX_NGRAM).each do |tam_ngram|
|
98
|
+
next unless current_token + tam_ngram <= terms.count
|
99
|
+
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
|
101
|
+
|
102
|
+
@lexicons[ngram.to_sym]&.each do |l|
|
103
|
+
properties = if l.aspects.present? then l.aspects else [l.aspect] end
|
104
|
+
properties.each do |p|
|
105
|
+
next if p.blank?
|
106
|
+
term_ids = all_term_ids[current_token..current_token+tam_ngram]
|
107
|
+
next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
|
108
|
+
|
109
|
+
uniq_aspects[p.to_sym] << Hashie::Mash.new(
|
110
|
+
term_ids: term_ids,
|
111
|
+
ngram: ngram,
|
112
|
+
lexicon: l,
|
113
|
+
)
|
109
114
|
end
|
110
115
|
end
|
111
116
|
end
|
@@ -133,24 +138,25 @@ module Opener
|
|
133
138
|
new_node("properties", "KAF/features")
|
134
139
|
end
|
135
140
|
|
136
|
-
def add_property
|
141
|
+
def add_property lemma, values, index
|
137
142
|
property_node = new_node("property", "KAF/features/properties")
|
138
143
|
|
139
|
-
property_node['lemma'] =
|
144
|
+
property_node['lemma'] = lemma.to_s
|
140
145
|
property_node['pid'] = "p#{index.to_s}"
|
141
146
|
|
142
147
|
references_node = new_node("references", property_node)
|
143
148
|
|
144
|
-
|
145
|
-
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.
|
149
|
+
values.each do |v|
|
150
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
|
146
151
|
references_node.add_child comm_node
|
147
152
|
|
148
|
-
span_node = new_node
|
153
|
+
span_node = new_node 'span', references_node
|
149
154
|
|
150
|
-
v.
|
151
|
-
target_node = new_node
|
155
|
+
v.term_ids.each do |id|
|
156
|
+
target_node = new_node 'target', span_node
|
152
157
|
|
153
|
-
target_node['id'] =
|
158
|
+
target_node['id'] = id.to_s
|
159
|
+
target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
|
154
160
|
end
|
155
161
|
end
|
156
162
|
end
|
@@ -17,10 +17,12 @@ module Opener
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def [] **params
|
20
|
+
existing = @cache[params]
|
21
|
+
return existing if existing and existing.from > UPDATE_INTERVAL.ago
|
22
|
+
params[:contract_ids] = nil unless params[:contract_ids]
|
23
|
+
|
20
24
|
synchronize do
|
21
|
-
|
22
|
-
break existing if existing and existing.from > UPDATE_INTERVAL.ago
|
23
|
-
@cache[params] = cache_update existing, **params
|
25
|
+
@cache[params] = cache_update @cache[params], **params
|
24
26
|
end
|
25
27
|
end
|
26
28
|
alias_method :get, :[]
|
@@ -49,7 +51,10 @@ module Opener
|
|
49
51
|
lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
|
50
52
|
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
51
53
|
lexicons.each do |l|
|
52
|
-
mapping[l.lemma.to_sym] << l
|
54
|
+
mapping[l.lemma.to_sym] << l
|
55
|
+
l.variants&.each do |v|
|
56
|
+
mapping[v.lemma.to_sym] << l
|
57
|
+
end
|
53
58
|
end
|
54
59
|
|
55
60
|
mapping
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-09-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -212,8 +212,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
212
212
|
- !ruby/object:Gem::Version
|
213
213
|
version: '0'
|
214
214
|
requirements: []
|
215
|
-
|
216
|
-
rubygems_version: 2.7.6
|
215
|
+
rubygems_version: 3.2.14
|
217
216
|
signing_key:
|
218
217
|
specification_version: 4
|
219
218
|
summary: Property tagger for hotels in Dutch and English.
|