opener-property-tagger 3.3.6 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
|
4
|
+
data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
|
7
|
+
data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
|
@@ -33,12 +33,17 @@ module Opener
|
|
33
33
|
# @param [String] path
|
34
34
|
#
|
35
35
|
def load_aspects(path)
|
36
|
-
mapping = Hash.new
|
36
|
+
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
37
37
|
|
38
|
-
File.foreach
|
39
|
-
lemma,
|
38
|
+
File.foreach path do |line|
|
39
|
+
lemma, pos, aspect = line.chomp.split("\t")
|
40
|
+
l = Hashie::Mash.new(
|
41
|
+
lemma: lemma,
|
42
|
+
pos: pos,
|
43
|
+
aspect: aspect,
|
44
|
+
)
|
40
45
|
|
41
|
-
mapping[lemma.to_sym] <<
|
46
|
+
mapping[l.lemma.to_sym] << l
|
42
47
|
end
|
43
48
|
|
44
49
|
return mapping
|
@@ -6,7 +6,8 @@ module Opener
|
|
6
6
|
class Processor
|
7
7
|
|
8
8
|
attr_accessor :document
|
9
|
-
attr_accessor :
|
9
|
+
attr_accessor :aspects_path, :aspects_url
|
10
|
+
attr_accessor :aspects, :lexicons
|
10
11
|
attr_accessor :timestamp, :pretty
|
11
12
|
|
12
13
|
##
|
@@ -34,10 +35,10 @@ module Opener
|
|
34
35
|
@remote = !url.nil?
|
35
36
|
@aspects_path = path
|
36
37
|
@aspects_url = url
|
37
|
-
@cache_keys = params[:cache_keys]
|
38
|
+
@cache_keys = params[:cache_keys] || {}
|
38
39
|
@cache_keys.merge! lang: @document.root.attr('xml:lang')
|
39
40
|
|
40
|
-
@
|
41
|
+
@lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
|
41
42
|
end
|
42
43
|
|
43
44
|
##
|
@@ -45,15 +46,13 @@ module Opener
|
|
45
46
|
# @return [String]
|
46
47
|
#
|
47
48
|
def process
|
48
|
-
existing_aspects = extract_aspects
|
49
|
-
|
50
49
|
add_features_layer
|
51
50
|
add_properties_layer
|
52
51
|
|
53
|
-
|
52
|
+
extract_aspects.each.with_index do |(lemma, values), index|
|
54
53
|
index += 1
|
55
54
|
|
56
|
-
add_property
|
55
|
+
add_property lemma, values, index
|
57
56
|
end
|
58
57
|
|
59
58
|
add_linguistic_processor
|
@@ -77,37 +76,41 @@ module Opener
|
|
77
76
|
@terms
|
78
77
|
end
|
79
78
|
|
79
|
+
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
80
|
+
# lemmas) belong to a property.
|
81
|
+
MAX_NGRAM = 2
|
82
|
+
|
80
83
|
##
|
81
84
|
# Check which terms belong to an aspect (property)
|
82
85
|
# Text have priority over Lemmas, overriding if there is a conflict
|
83
86
|
# @return [Hash]
|
84
87
|
#
|
85
88
|
def extract_aspects
|
86
|
-
|
89
|
+
all_term_ids = terms.keys
|
87
90
|
lemmas = terms.values
|
88
|
-
uniq_aspects = Hash.new
|
91
|
+
uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
|
89
92
|
|
90
93
|
[:lemma, :text].each do |k|
|
91
94
|
current_token = 0
|
92
|
-
# Use of n-grams to determine if a unigram (1 lemma) or bigram (2
|
93
|
-
# lemmas) belong to a property.
|
94
|
-
max_ngram = 2
|
95
|
-
|
96
95
|
|
97
96
|
while current_token < terms.count
|
98
|
-
(0..
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
97
|
+
(0..MAX_NGRAM).each do |tam_ngram|
|
98
|
+
next unless current_token + tam_ngram <= terms.count
|
99
|
+
|
100
|
+
ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
|
101
|
+
|
102
|
+
@lexicons[ngram.to_sym]&.each do |l|
|
103
|
+
properties = if l.aspects.present? then l.aspects else [l.aspect] end
|
104
|
+
properties.each do |p|
|
105
|
+
next if p.blank?
|
106
|
+
term_ids = all_term_ids[current_token..current_token+tam_ngram]
|
107
|
+
next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
|
108
|
+
|
109
|
+
uniq_aspects[p.to_sym] << Hashie::Mash.new(
|
110
|
+
term_ids: term_ids,
|
111
|
+
ngram: ngram,
|
112
|
+
lexicon: l,
|
113
|
+
)
|
111
114
|
end
|
112
115
|
end
|
113
116
|
end
|
@@ -135,24 +138,25 @@ module Opener
|
|
135
138
|
new_node("properties", "KAF/features")
|
136
139
|
end
|
137
140
|
|
138
|
-
def add_property
|
141
|
+
def add_property lemma, values, index
|
139
142
|
property_node = new_node("property", "KAF/features/properties")
|
140
143
|
|
141
|
-
property_node['lemma'] =
|
144
|
+
property_node['lemma'] = lemma.to_s
|
142
145
|
property_node['pid'] = "p#{index.to_s}"
|
143
146
|
|
144
147
|
references_node = new_node("references", property_node)
|
145
148
|
|
146
|
-
|
147
|
-
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.
|
149
|
+
values.each do |v|
|
150
|
+
comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
|
148
151
|
references_node.add_child comm_node
|
149
152
|
|
150
|
-
span_node = new_node
|
153
|
+
span_node = new_node 'span', references_node
|
151
154
|
|
152
|
-
v.
|
153
|
-
target_node = new_node
|
155
|
+
v.term_ids.each do |id|
|
156
|
+
target_node = new_node 'target', span_node
|
154
157
|
|
155
|
-
target_node['id'] =
|
158
|
+
target_node['id'] = id.to_s
|
159
|
+
target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
|
156
160
|
end
|
157
161
|
end
|
158
162
|
end
|
@@ -49,7 +49,10 @@ module Opener
|
|
49
49
|
lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
|
50
50
|
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
51
51
|
lexicons.each do |l|
|
52
|
-
mapping[l.lemma.to_sym] << l
|
52
|
+
mapping[l.lemma.to_sym] << l
|
53
|
+
l.variants&.each do |v|
|
54
|
+
mapping[v.lemma.to_sym] << l
|
55
|
+
end
|
53
56
|
end
|
54
57
|
|
55
58
|
mapping
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|