opener-property-tagger 3.3.6 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
4
- data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
3
+ metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
4
+ data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
5
5
  SHA512:
6
- metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
7
- data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
6
+ metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
7
+ data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
39
- lemma, _pos, aspect = line.chomp.split("\t")
38
+ File.foreach path do |line|
39
+ lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -34,10 +35,10 @@ module Opener
34
35
  @remote = !url.nil?
35
36
  @aspects_path = path
36
37
  @aspects_url = url
37
- @cache_keys = params[:cache_keys]
38
+ @cache_keys = params[:cache_keys] || {}
38
39
  @cache_keys.merge! lang: @document.root.attr('xml:lang')
39
40
 
40
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
42
  end
42
43
 
43
44
  ##
@@ -45,15 +46,13 @@ module Opener
45
46
  # @return [String]
46
47
  #
47
48
  def process
48
- existing_aspects = extract_aspects
49
-
50
49
  add_features_layer
51
50
  add_properties_layer
52
51
 
53
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
54
53
  index += 1
55
54
 
56
- add_property(key, value, index)
55
+ add_property lemma, values, index
57
56
  end
58
57
 
59
58
  add_linguistic_processor
@@ -77,37 +76,41 @@ module Opener
77
76
  @terms
78
77
  end
79
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belongs to a property.
81
+ MAX_NGRAM = 2
82
+
80
83
  ##
81
84
  # Check which terms belong to an aspect (property)
82
85
  # Text has priority over lemmas, overriding them if there is a conflict
83
86
  # @return [Hash]
84
87
  #
85
88
  def extract_aspects
86
- term_ids = terms.keys
89
+ all_term_ids = terms.keys
87
90
  lemmas = terms.values
88
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
89
92
 
90
93
  [:lemma, :text].each do |k|
91
94
  current_token = 0
92
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
93
- # lemmas) belong to a property.
94
- max_ngram = 2
95
-
96
95
 
97
96
  while current_token < terms.count
98
- (0..max_ngram).each do |tam_ngram|
99
- if current_token + tam_ngram <= terms.count
100
- ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
101
-
102
- if aspects[ngram.to_sym]
103
- properties = aspects[ngram.to_sym]
104
- ids = term_ids[current_token..current_token+tam_ngram]
105
-
106
- properties.uniq.each do |property|
107
- next if !property or property.strip.empty?
108
-
109
- uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
110
- end
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
111
114
  end
112
115
  end
113
116
  end
@@ -135,24 +138,25 @@ module Opener
135
138
  new_node("properties", "KAF/features")
136
139
  end
137
140
 
138
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
139
142
  property_node = new_node("property", "KAF/features/properties")
140
143
 
141
- property_node['lemma'] = key.to_s
144
+ property_node['lemma'] = lemma.to_s
142
145
  property_node['pid'] = "p#{index.to_s}"
143
146
 
144
147
  references_node = new_node("references", property_node)
145
148
 
146
- value.uniq.each do |v|
147
- comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
148
151
  references_node.add_child comm_node
149
152
 
150
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
151
154
 
152
- v.first.each do |val|
153
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
154
157
 
155
- target_node['id'] = val.to_s
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
156
160
  end
157
161
  end
158
162
  end
@@ -49,7 +49,10 @@ module Opener
49
49
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
50
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
51
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
52
+ mapping[l.lemma.to_sym] << l
53
+ l.variants&.each do |v|
54
+ mapping[v.lemma.to_sym] << l
55
+ end
53
56
  end
54
57
 
55
58
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.6'
4
+ VERSION = '3.4.0'
5
5
 
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.6
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-12-10 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons