opener-property-tagger 3.3.2 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6483ef2a01a413e280ed9555566499496748cb0b8c25ddd9853fb50165ab03f9
4
- data.tar.gz: eb0f85d3d82aed282d21a3b51a0c75867b62c547b49fd1ceeaac374baff86111
3
+ metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
4
+ data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
5
5
  SHA512:
6
- metadata.gz: fe31cb493644de7eaed474d9064e3f8880f229b8e05f0685aae18e730f5cd3d94c92aa403966904147b710f0e9589cffa840cf44ac83d64fc7f9c9211ffc6313
7
- data.tar.gz: '0939965101de5ccd503bbdeae6b511552354d8f285e263b739b5d5f004ffdf03411fac5b4a97404f728bd1038f8320843c2dcb2f8ae70059eedc44e0682c4d71'
6
+ metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
7
+ data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
38
+ File.foreach path do |line|
39
39
  lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -25,18 +26,19 @@ module Opener
25
26
  # by default due to the performance overhead.
26
27
  #
27
28
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
29
+ @document = Nokogiri.XML file
29
30
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
31
  @timestamp = timestamp
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,83 +46,79 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
59
59
 
60
- return pretty ? pretty_print(document) : document.to_xml
60
+ pretty ? pretty_print(document) : document.to_xml
61
61
  end
62
62
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
63
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
64
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
65
  end
71
66
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
67
  def terms
77
68
  unless @terms
78
69
  @terms = {}
79
70
 
80
71
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
72
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
73
  end
83
74
  end
84
75
 
85
- return @terms
76
+ @terms
86
77
  end
87
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
88
83
  ##
89
84
  # Check which terms belong to an aspect (property)
85
+ # Text have priority over Lemmas, overriding if there is a conflict
90
86
  # @return [Hash]
91
87
  #
92
88
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
95
-
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
100
-
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
-
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
107
-
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
111
-
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
114
-
115
- uniq_aspects[property.to_sym] << [ids,ngram]
89
+ all_term_ids = terms.keys
90
+ lemmas = terms.values
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
92
+
93
+ [:lemma, :text].each do |k|
94
+ current_token = 0
95
+
96
+ while current_token < terms.count
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
116
114
  end
117
115
  end
118
116
  end
117
+ current_token += 1
119
118
  end
120
- current_token += 1
121
119
  end
122
120
 
123
- return Hash[uniq_aspects.sort]
121
+ Hash[uniq_aspects.sort]
124
122
  end
125
123
 
126
124
  ##
@@ -140,25 +138,25 @@ module Opener
140
138
  new_node("properties", "KAF/features")
141
139
  end
142
140
 
143
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
144
142
  property_node = new_node("property", "KAF/features/properties")
145
143
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
144
+ property_node['lemma'] = lemma.to_s
145
+ property_node['pid'] = "p#{index.to_s}"
148
146
 
149
147
  references_node = new_node("references", property_node)
150
148
 
151
- value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
151
+ references_node.add_child comm_node
155
152
 
156
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
157
154
 
158
- v.first.each do |val|
159
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
160
157
 
161
- target_node.set('id', val.to_s)
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
162
160
  end
163
161
  end
164
162
  end
@@ -169,19 +167,19 @@ module Opener
169
167
  version = '2.0'
170
168
 
171
169
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
170
+ node['layer'] = 'features'
173
171
 
174
172
  lp_node = new_node('lp', node)
175
173
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
174
+ lp_node['version'] = "#{last_edited}-#{version}"
175
+ lp_node['name'] = description
178
176
 
179
177
  if timestamp
180
178
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
179
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
180
+ lp_node['timestamp'] = Time.now.strftime(format)
183
181
  else
184
- lp_node.set('timestamp', '*')
182
+ lp_node['timestamp'] = '*'
185
183
  end
186
184
  end
187
185
 
@@ -200,7 +198,7 @@ module Opener
200
198
  formatter.compact = true
201
199
  formatter.write(doc, out)
202
200
 
203
- return out.strip
201
+ out.strip
204
202
  end
205
203
 
206
204
  protected
@@ -212,11 +210,11 @@ module Opener
212
210
  parent_node = parent
213
211
  end
214
212
 
215
- node = Oga::XML::Element.new(:name => tag)
213
+ node = Nokogiri::XML::Element.new(tag, document)
216
214
 
217
- parent_node.children << node
215
+ parent_node.add_child node
218
216
 
219
- return node
217
+ node
220
218
  end
221
219
 
222
220
  ##
@@ -224,7 +222,7 @@ module Opener
224
222
  # @return [Boolean]
225
223
  #
226
224
  def is_kaf?
227
- return !!document.at_xpath('KAF')
225
+ !!document.at_xpath('KAF')
228
226
  end
229
227
 
230
228
  ##
@@ -41,15 +41,18 @@ module Opener
41
41
  end
42
42
 
43
43
  def load_aspects lang:, cache:, **params
44
- url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
- url += "&if_updated_since=#{cache.from.iso8601}" if cache
44
+ url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
+ url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
46
46
  puts "#{lang}: loading aspects from #{url}"
47
47
 
48
48
  lexicons = JSON.parse HTTPClient.new.get(url).body
49
49
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
50
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
51
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
52
+ mapping[l.lemma.to_sym] << l
53
+ l.variants&.each do |v|
54
+ mapping[v.lemma.to_sym] << l
55
+ end
53
56
  end
54
57
 
55
58
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.2'
4
+ VERSION = '3.4.0'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,170 +1,164 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.2
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-07 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: opener-daemons
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '2.2'
19
- name: opener-daemons
20
- prerelease: false
21
20
  type: :runtime
21
+ prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: opener-webservice
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.1'
33
- name: opener-webservice
34
- prerelease: false
35
34
  type: :runtime
35
+ prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.1'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: opener-core
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - "~>"
45
46
  - !ruby/object:Gem::Version
46
47
  version: '2.2'
47
- name: opener-core
48
- prerelease: false
49
48
  type: :runtime
49
+ prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: nokogiri
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
- - - "~>"
59
- - !ruby/object:Gem::Version
60
- version: '1.0'
61
59
  - - ">="
62
60
  - !ruby/object:Gem::Version
63
- version: 1.3.1
64
- name: oga
65
- prerelease: false
61
+ version: '0'
66
62
  type: :runtime
63
+ prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
70
+ name: httpclient
76
71
  requirement: !ruby/object:Gem::Requirement
77
72
  requirements:
78
73
  - - ">="
79
74
  - !ruby/object:Gem::Version
80
75
  version: '0'
81
- name: httpclient
82
- prerelease: false
83
76
  type: :runtime
77
+ prerelease: false
84
78
  version_requirements: !ruby/object:Gem::Requirement
85
79
  requirements:
86
80
  - - ">="
87
81
  - !ruby/object:Gem::Version
88
82
  version: '0'
89
83
  - !ruby/object:Gem::Dependency
84
+ name: hashie
90
85
  requirement: !ruby/object:Gem::Requirement
91
86
  requirements:
92
87
  - - ">="
93
88
  - !ruby/object:Gem::Version
94
89
  version: '0'
95
- name: hashie
96
- prerelease: false
97
90
  type: :runtime
91
+ prerelease: false
98
92
  version_requirements: !ruby/object:Gem::Requirement
99
93
  requirements:
100
94
  - - ">="
101
95
  - !ruby/object:Gem::Version
102
96
  version: '0'
103
97
  - !ruby/object:Gem::Dependency
98
+ name: activesupport
104
99
  requirement: !ruby/object:Gem::Requirement
105
100
  requirements:
106
101
  - - ">="
107
102
  - !ruby/object:Gem::Version
108
103
  version: '0'
109
- name: activesupport
110
- prerelease: false
111
104
  type: :runtime
105
+ prerelease: false
112
106
  version_requirements: !ruby/object:Gem::Requirement
113
107
  requirements:
114
108
  - - ">="
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  - !ruby/object:Gem::Dependency
112
+ name: rspec
118
113
  requirement: !ruby/object:Gem::Requirement
119
114
  requirements:
120
115
  - - "~>"
121
116
  - !ruby/object:Gem::Version
122
117
  version: '3.0'
123
- name: rspec
124
- prerelease: false
125
118
  type: :development
119
+ prerelease: false
126
120
  version_requirements: !ruby/object:Gem::Requirement
127
121
  requirements:
128
122
  - - "~>"
129
123
  - !ruby/object:Gem::Version
130
124
  version: '3.0'
131
125
  - !ruby/object:Gem::Dependency
126
+ name: cucumber
132
127
  requirement: !ruby/object:Gem::Requirement
133
128
  requirements:
134
129
  - - ">="
135
130
  - !ruby/object:Gem::Version
136
131
  version: '0'
137
- name: cucumber
138
- prerelease: false
139
132
  type: :development
133
+ prerelease: false
140
134
  version_requirements: !ruby/object:Gem::Requirement
141
135
  requirements:
142
136
  - - ">="
143
137
  - !ruby/object:Gem::Version
144
138
  version: '0'
145
139
  - !ruby/object:Gem::Dependency
140
+ name: rake
146
141
  requirement: !ruby/object:Gem::Requirement
147
142
  requirements:
148
143
  - - ">="
149
144
  - !ruby/object:Gem::Version
150
145
  version: '0'
151
- name: rake
152
- prerelease: false
153
146
  type: :development
147
+ prerelease: false
154
148
  version_requirements: !ruby/object:Gem::Requirement
155
149
  requirements:
156
150
  - - ">="
157
151
  - !ruby/object:Gem::Version
158
152
  version: '0'
159
153
  - !ruby/object:Gem::Dependency
154
+ name: benchmark-ips
160
155
  requirement: !ruby/object:Gem::Requirement
161
156
  requirements:
162
157
  - - "~>"
163
158
  - !ruby/object:Gem::Version
164
159
  version: '2.0'
165
- name: benchmark-ips
166
- prerelease: false
167
160
  type: :development
161
+ prerelease: false
168
162
  version_requirements: !ruby/object:Gem::Requirement
169
163
  requirements:
170
164
  - - "~>"
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
219
213
  version: '0'
220
214
  requirements: []
221
215
  rubyforge_project:
222
- rubygems_version: 2.7.9
216
+ rubygems_version: 2.7.8
223
217
  signing_key:
224
218
  specification_version: 4
225
219
  summary: Property tagger for hotels in Dutch and English.