opener-property-tagger 3.3.6 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bdc478005838e629b52e310437c5cac89158b8f33429d477c7096f1210c3249
4
- data.tar.gz: a8fec4e6229cfd39f1cc0f11c3a8007c2c40442fd6ed2f00a558d2cbd8f01c8d
3
+ metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
4
+ data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
5
5
  SHA512:
6
- metadata.gz: a88d97bd1fff6f57c956024d7d47affe13179a02e068129ddda1be0c577324eafee18c271e465d68dc06a99d7cdd78288e6094f307ab4503ee68c963e618bafc
7
- data.tar.gz: 6e601c83e09931821e6d00c03114fbaabb72f8c72781f667eea384b7beb916adc3ccc42f83dbd15434779e2dd3e5f574bcb8ef7be022f3154f0afdabec0107b2
6
+ metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
7
+ data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
39
- lemma, _pos, aspect = line.chomp.split("\t")
38
+ File.foreach path do |line|
39
+ lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -34,10 +35,10 @@ module Opener
34
35
  @remote = !url.nil?
35
36
  @aspects_path = path
36
37
  @aspects_url = url
37
- @cache_keys = params[:cache_keys]
38
+ @cache_keys = params[:cache_keys] || {}
38
39
  @cache_keys.merge! lang: @document.root.attr('xml:lang')
39
40
 
40
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
42
  end
42
43
 
43
44
  ##
@@ -45,15 +46,13 @@ module Opener
45
46
  # @return [String]
46
47
  #
47
48
  def process
48
- existing_aspects = extract_aspects
49
-
50
49
  add_features_layer
51
50
  add_properties_layer
52
51
 
53
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
54
53
  index += 1
55
54
 
56
- add_property(key, value, index)
55
+ add_property lemma, values, index
57
56
  end
58
57
 
59
58
  add_linguistic_processor
@@ -77,37 +76,41 @@ module Opener
77
76
  @terms
78
77
  end
79
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
80
83
  ##
81
84
  # Check which terms belong to an aspect (property)
82
85
  # Text have priority over Lemmas, overriding if there is a conflict
83
86
  # @return [Hash]
84
87
  #
85
88
  def extract_aspects
86
- term_ids = terms.keys
89
+ all_term_ids = terms.keys
87
90
  lemmas = terms.values
88
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
89
92
 
90
93
  [:lemma, :text].each do |k|
91
94
  current_token = 0
92
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
93
- # lemmas) belong to a property.
94
- max_ngram = 2
95
-
96
95
 
97
96
  while current_token < terms.count
98
- (0..max_ngram).each do |tam_ngram|
99
- if current_token + tam_ngram <= terms.count
100
- ngram = lemmas[current_token..current_token+tam_ngram].map{|a| a[k] }.join(" ").downcase
101
-
102
- if aspects[ngram.to_sym]
103
- properties = aspects[ngram.to_sym]
104
- ids = term_ids[current_token..current_token+tam_ngram]
105
-
106
- properties.uniq.each do |property|
107
- next if !property or property.strip.empty?
108
-
109
- uniq_aspects[property.to_sym] << [ids,ngram] unless uniq_aspects[property.to_sym].include? [ids,ngram]
110
- end
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
111
114
  end
112
115
  end
113
116
  end
@@ -135,24 +138,25 @@ module Opener
135
138
  new_node("properties", "KAF/features")
136
139
  end
137
140
 
138
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
139
142
  property_node = new_node("property", "KAF/features/properties")
140
143
 
141
- property_node['lemma'] = key.to_s
144
+ property_node['lemma'] = lemma.to_s
142
145
  property_node['pid'] = "p#{index.to_s}"
143
146
 
144
147
  references_node = new_node("references", property_node)
145
148
 
146
- value.uniq.each do |v|
147
- comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.last} ")
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
148
151
  references_node.add_child comm_node
149
152
 
150
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
151
154
 
152
- v.first.each do |val|
153
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
154
157
 
155
- target_node['id'] = val.to_s
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
156
160
  end
157
161
  end
158
162
  end
@@ -49,7 +49,10 @@ module Opener
49
49
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
50
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
51
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
52
+ mapping[l.lemma.to_sym] << l
53
+ l.variants&.each do |v|
54
+ mapping[v.lemma.to_sym] << l
55
+ end
53
56
  end
54
57
 
55
58
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.6'
4
+ VERSION = '3.4.0'
5
5
 
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.6
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-12-10 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons