opener-property-tagger 3.3.6 → 3.4.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
|
4
|
+
data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
|
7
|
+
data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
|
@@ -33,12 +33,17 @@ module Opener
|
|
33
33
|
# @param [String] path
|
34
34
|
#
|
35
35
|
def load_aspects(path)
|
36
|
-
mapping = Hash.new
|
36
|
+
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
37
37
|
|
38
|
-
File.foreach
|
39
|
-
lemma,
|
38
|
+
File.foreach path do |line|
|
39
|
+
lemma, pos, aspect = line.chomp.split("\t")
|
40
|
+
l = Hashie::Mash.new(
|
41
|
+
lemma: lemma,
|
42
|
+
pos: pos,
|
43
|
+
aspect: aspect,
|
44
|
+
)
|
40
45
|
|
41
|
-
mapping[lemma.to_sym] <<
|
46
|
+
mapping[l.lemma.to_sym] << l
|
42
47
|
end
|
43
48
|
|
44
49
|
return mapping
|
@@ -6,7 +6,8 @@ module Opener
|
|
6
6
|
class Processor
|
7
7
|
|
8
8
|
attr_accessor :document
|
9
|
-
attr_accessor :
|
9
|
+
attr_accessor :aspects_path, :aspects_url
|
10
|
+
attr_accessor :aspects, :lexicons
|
10
11
|
attr_accessor :timestamp, :pretty
|
11
12
|
|
12
13
|
##
|
@@ -34,10 +35,10 @@ module Opener
|
|
34
35
|
@remote = !url.nil?
|
35
36
|
@aspects_path = path
|
36
37
|
@aspects_url = url
|
37
|
-
@cache_keys = params[:cache_keys]
|
38
|
+
@cache_keys = params[:cache_keys] || {}
|
38
39
|
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
39
40
|
|
40
|
-
@
|
41
|
+
@lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
41
42
|
end
|
42
43
|
|
43
44
|
##
|
@@ -45,15 +46,13 @@ module Opener
|
|
45
46
|
# @return [String]
|
46
47
|
#
|
47
48
|
def process
|
48
|
-
existing_aspects = extract_aspects
|
49
|
-
|
50
49
|
add_features_layer
|
51
50
|
add_properties_layer
|
52
51
|
|
53
|
-
|
52
|
+
extract_aspects.each.with_index do |(lemma, values), index|
|
54
53
|
index += 1
|
55
54
|
|
56
|
-
add_property
|
55
|
+
add_property lemma, values, index
|
57
56
|
end
|
58
57
|
|
59
58
|
add_linguistic_processor
|
@@ -77,37 +76,41 @@ module Opener
|
|
77
76
|
@terms
|
78
77
|
end
|
79
78
|
|
79
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
80
|
+
# lemmas) belong to a property.
|
81
|
+
MAX_NGRAM = 2
|
82
|
+
|
80
83
|
##
|
81
84
|
# Check which terms belong to an aspect (property)
|
82
85
|
# Text have priority over Lemmas, overriding if there is a conflict
|
83
86
|
# @return [Hash]
|
84
87
|
#
|
85
88
|
def extract_aspects
|
86
|
-
|
89
|
+
all_term_ids = terms.keys
|
87
90
|
lemmas = terms.values
|
88
|
-
uniq_aspects = Hash.new
|
91
|
+
uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
|
89
92
|
|
90
93
|
[:lemma, :text].each do |k|
|
91
94
|
current_token = 0
|
92
|
-
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
93
|
-
# lemmas) belong to a property.
|
94
|
-
max_ngram = 2
|
95
|
-
|
96
95
|
|
97
96
|
while current_token < terms.count
|
98
|
-
(0..
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
97
|
+
(0..MAX_NGRAM).each do |tam_ngram|
|
98
|
+
next unless current_token + tam_ngram <= terms.count
|
99
|
+
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
|
101
|
+
|
102
|
+
@lexicons[ngram.to_sym]&.each do |l|
|
103
|
+
properties = if l.aspects.present? then l.aspects else [l.aspect] end
|
104
|
+
properties.each do |p|
|
105
|
+
next if p.blank?
|
106
|
+
term_ids = all_term_ids[current_token..current_token+tam_ngram]
|
107
|
+
next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
|
108
|
+
|
109
|
+
uniq_aspects[p.to_sym] << Hashie::Mash.new(
|
110
|
+
term_ids: term_ids,
|
111
|
+
ngram: ngram,
|
112
|
+
lexicon: l,
|
113
|
+
)
|
111
114
|
end
|
112
115
|
end
|
113
116
|
end
|
@@ -135,24 +138,25 @@ module Opener
|
|
135
138
|
new_node("properties", "KAF/features")
|
136
139
|
end
|
137
140
|
|
138
|
-
def add_property
|
141
|
+
def add_property lemma, values, index
|
139
142
|
property_node = new_node("property", "KAF/features/properties")
|
140
143
|
|
141
|
-
property_node['lemma'] =
|
144
|
+
property_node['lemma'] = lemma.to_s
|
142
145
|
property_node['pid'] = "p#{index.to_s}"
|
143
146
|
|
144
147
|
references_node = new_node("references", property_node)
|
145
148
|
|
146
|
-
|
147
|
-
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.
|
149
|
+
values.each do |v|
|
150
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
|
148
151
|
references_node.add_child comm_node
|
149
152
|
|
150
|
-
span_node = new_node
|
153
|
+
span_node = new_node 'span', references_node
|
151
154
|
|
152
|
-
v.
|
153
|
-
target_node = new_node
|
155
|
+
v.term_ids.each do |id|
|
156
|
+
target_node = new_node 'target', span_node
|
154
157
|
|
155
|
-
target_node['id'] =
|
158
|
+
target_node['id'] = id.to_s
|
159
|
+
target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
|
156
160
|
end
|
157
161
|
end
|
158
162
|
end
|
@@ -49,7 +49,10 @@ module Opener
|
|
49
49
|
lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
|
50
50
|
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
51
51
|
lexicons.each do |l|
|
52
|
-
mapping[l.lemma.to_sym] << l
|
52
|
+
mapping[l.lemma.to_sym] << l
|
53
|
+
l.variants&.each do |v|
|
54
|
+
mapping[v.lemma.to_sym] << l
|
55
|
+
end
|
53
56
|
end
|
54
57
|
|
55
58
|
mapping
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|