opener-property-tagger 3.3.2 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6483ef2a01a413e280ed9555566499496748cb0b8c25ddd9853fb50165ab03f9
4
- data.tar.gz: eb0f85d3d82aed282d21a3b51a0c75867b62c547b49fd1ceeaac374baff86111
3
+ metadata.gz: 6d4ff307d205b179b0914dc2f4d5278dcf9d345586bd2c1917e76f4dcfe6bb87
4
+ data.tar.gz: 459672172d0eac97a2b00784c63fe1c8375e06e6be07e1049611ee589ed758e9
5
5
  SHA512:
6
- metadata.gz: fe31cb493644de7eaed474d9064e3f8880f229b8e05f0685aae18e730f5cd3d94c92aa403966904147b710f0e9589cffa840cf44ac83d64fc7f9c9211ffc6313
7
- data.tar.gz: '0939965101de5ccd503bbdeae6b511552354d8f285e263b739b5d5f004ffdf03411fac5b4a97404f728bd1038f8320843c2dcb2f8ae70059eedc44e0682c4d71'
6
+ metadata.gz: 69792d8309041f86c67dd36f9c669c579f6891a8304a95db30879ee94c8e19a112ba3f1cc61d83350c8cbb7a4fac130628ac0a47ce4635afe5b79e6a42d11bb7
7
+ data.tar.gz: 7c261405d53473cd1c0785dcd1e7d19851c2491b8970a330cf8b97b3943e297ada8c3ee41d5395d1d2f4de7b40e76f2e58493a7dbdb5200129465e2087e65bb8
@@ -1,6 +1,6 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
- require 'oga'
3
+ require 'nokogiri'
4
4
  require 'monitor'
5
5
  require 'httpclient'
6
6
  require 'hashie'
@@ -33,12 +33,17 @@ module Opener
33
33
  # @param [String] path
34
34
  #
35
35
  def load_aspects(path)
36
- mapping = Hash.new { |hash, key| hash[key] = [] }
36
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
37
37
 
38
- File.foreach(path) do |line|
38
+ File.foreach path do |line|
39
39
  lemma, pos, aspect = line.chomp.split("\t")
40
+ l = Hashie::Mash.new(
41
+ lemma: lemma,
42
+ pos: pos,
43
+ aspect: aspect,
44
+ )
40
45
 
41
- mapping[lemma.to_sym] << aspect
46
+ mapping[l.lemma.to_sym] << l
42
47
  end
43
48
 
44
49
  return mapping
@@ -6,7 +6,8 @@ module Opener
6
6
  class Processor
7
7
 
8
8
  attr_accessor :document
9
- attr_accessor :aspects, :aspects_path, :aspects_url
9
+ attr_accessor :aspects_path, :aspects_url
10
+ attr_accessor :aspects, :lexicons
10
11
  attr_accessor :timestamp, :pretty
11
12
 
12
13
  ##
@@ -25,18 +26,19 @@ module Opener
25
26
  # by default due to the performance overhead.
26
27
  #
27
28
  def initialize file, params: {}, url: nil, path: nil, timestamp: true, pretty: false
28
- @document = Oga.parse_xml file
29
+ @document = Nokogiri.XML file
29
30
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
31
  @timestamp = timestamp
31
32
  @pretty = pretty
32
33
 
33
34
  @params = params
34
- @cache_keys = params[:cache_keys] || {lang: language}
35
35
  @remote = !url.nil?
36
36
  @aspects_path = path
37
37
  @aspects_url = url
38
+ @cache_keys = params[:cache_keys] || {}
39
+ @cache_keys.merge! lang: @document.root.attr('xml:lang')
38
40
 
39
- @aspects = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
41
+ @lexicons = if @remote then REMOTE_ASPECTS_CACHE[**@cache_keys].aspects else FILE_ASPECTS_CACHE[aspects_file] end
40
42
  end
41
43
 
42
44
  ##
@@ -44,83 +46,79 @@ module Opener
44
46
  # @return [String]
45
47
  #
46
48
  def process
47
- existing_aspects = extract_aspects
48
-
49
49
  add_features_layer
50
50
  add_properties_layer
51
51
 
52
- existing_aspects.each_with_index do |(key, value), index|
52
+ extract_aspects.each.with_index do |(lemma, values), index|
53
53
  index += 1
54
54
 
55
- add_property(key, value, index)
55
+ add_property lemma, values, index
56
56
  end
57
57
 
58
58
  add_linguistic_processor
59
59
 
60
- return pretty ? pretty_print(document) : document.to_xml
60
+ pretty ? pretty_print(document) : document.to_xml
61
61
  end
62
62
 
63
- ##
64
- # Get the language of the input file.
65
- #
66
- # @return [String]
67
- #
68
63
  def language
69
- return @language ||= document.at_xpath('KAF').get('xml:lang')
64
+ @language ||= document.at_xpath('KAF').attr('xml:lang')
70
65
  end
71
66
 
72
- ##
73
- # Get the terms from the input file
74
- # @return [Hash]
75
- #
76
67
  def terms
77
68
  unless @terms
78
69
  @terms = {}
79
70
 
80
71
  document.xpath('KAF/terms/term').each do |term|
81
- @terms[term.get('tid').to_sym] = term.get('lemma')
72
+ @terms[term.attr('tid').to_sym] = { lemma: term.attr('lemma'), text: term.attr('text')}
82
73
  end
83
74
  end
84
75
 
85
- return @terms
76
+ @terms
86
77
  end
87
78
 
79
+ # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
80
+ # lemmas) belong to a property.
81
+ MAX_NGRAM = 2
82
+
88
83
  ##
89
84
  # Check which terms belong to an aspect (property)
85
+ # Text have priority over Lemmas, overriding if there is a conflict
90
86
  # @return [Hash]
91
87
  #
92
88
  def extract_aspects
93
- term_ids = terms.keys
94
- lemmas = terms.values
95
-
96
- current_token = 0
97
- # Use of n-grams to determine if a unigram (1 lemma) or bigram (2
98
- # lemmas) belong to a property.
99
- max_ngram = 2
100
-
101
- uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
102
-
103
- while current_token < terms.count
104
- (0..max_ngram).each do |tam_ngram|
105
- if current_token + tam_ngram <= terms.count
106
- ngram = lemmas[current_token..current_token+tam_ngram].join(" ").downcase
107
-
108
- if aspects[ngram.to_sym]
109
- properties = aspects[ngram.to_sym]
110
- ids = term_ids[current_token..current_token+tam_ngram]
111
-
112
- properties.uniq.each do |property|
113
- next if !property or property.strip.empty?
114
-
115
- uniq_aspects[property.to_sym] << [ids,ngram]
89
+ all_term_ids = terms.keys
90
+ lemmas = terms.values
91
+ uniq_aspects = Hash.new{ |hash, lemma| hash[lemma] = [] }
92
+
93
+ [:lemma, :text].each do |k|
94
+ current_token = 0
95
+
96
+ while current_token < terms.count
97
+ (0..MAX_NGRAM).each do |tam_ngram|
98
+ next unless current_token + tam_ngram <= terms.count
99
+
100
+ ngram = lemmas[current_token..current_token+tam_ngram].map{ |a| a[k] }.join(" ").downcase
101
+
102
+ @lexicons[ngram.to_sym]&.each do |l|
103
+ properties = if l.aspects.present? then l.aspects else [l.aspect] end
104
+ properties.each do |p|
105
+ next if p.blank?
106
+ term_ids = all_term_ids[current_token..current_token+tam_ngram]
107
+ next if uniq_aspects[p.to_sym].find{ |v| v.term_ids == term_ids }
108
+
109
+ uniq_aspects[p.to_sym] << Hashie::Mash.new(
110
+ term_ids: term_ids,
111
+ ngram: ngram,
112
+ lexicon: l,
113
+ )
116
114
  end
117
115
  end
118
116
  end
117
+ current_token += 1
119
118
  end
120
- current_token += 1
121
119
  end
122
120
 
123
- return Hash[uniq_aspects.sort]
121
+ Hash[uniq_aspects.sort]
124
122
  end
125
123
 
126
124
  ##
@@ -140,25 +138,25 @@ module Opener
140
138
  new_node("properties", "KAF/features")
141
139
  end
142
140
 
143
- def add_property(key, value, index)
141
+ def add_property lemma, values, index
144
142
  property_node = new_node("property", "KAF/features/properties")
145
143
 
146
- property_node.set('lemma', key.to_s)
147
- property_node.set('pid', "p#{index.to_s}")
144
+ property_node['lemma'] = lemma.to_s
145
+ property_node['pid'] = "p#{index.to_s}"
148
146
 
149
147
  references_node = new_node("references", property_node)
150
148
 
151
- value.uniq.each do |v|
152
- comment = Oga::XML::Comment.new(:text => " #{v.last} ")
153
-
154
- references_node.children << comment
149
+ values.each do |v|
150
+ comm_node = Nokogiri::XML::Comment.new(references_node, " #{v.ngram} ")
151
+ references_node.add_child comm_node
155
152
 
156
- span_node = new_node("span", references_node)
153
+ span_node = new_node 'span', references_node
157
154
 
158
- v.first.each do |val|
159
- target_node = new_node("target", span_node)
155
+ v.term_ids.each do |id|
156
+ target_node = new_node 'target', span_node
160
157
 
161
- target_node.set('id', val.to_s)
158
+ target_node['id'] = id.to_s
159
+ target_node['lexicon-id'] = v.lexicon.id if v.lexicon.id
162
160
  end
163
161
  end
164
162
  end
@@ -169,19 +167,19 @@ module Opener
169
167
  version = '2.0'
170
168
 
171
169
  node = new_node('linguisticProcessors', 'KAF/kafHeader')
172
- node.set('layer', 'features')
170
+ node['layer'] = 'features'
173
171
 
174
172
  lp_node = new_node('lp', node)
175
173
 
176
- lp_node.set('version', "#{last_edited}-#{version}")
177
- lp_node.set('name', description)
174
+ lp_node['version'] = "#{last_edited}-#{version}"
175
+ lp_node['name'] = description
178
176
 
179
177
  if timestamp
180
178
  format = '%Y-%m-%dT%H:%M:%S%Z'
181
179
 
182
- lp_node.set('timestamp', Time.now.strftime(format))
180
+ lp_node['timestamp'] = Time.now.strftime(format)
183
181
  else
184
- lp_node.set('timestamp', '*')
182
+ lp_node['timestamp'] = '*'
185
183
  end
186
184
  end
187
185
 
@@ -200,7 +198,7 @@ module Opener
200
198
  formatter.compact = true
201
199
  formatter.write(doc, out)
202
200
 
203
- return out.strip
201
+ out.strip
204
202
  end
205
203
 
206
204
  protected
@@ -212,11 +210,11 @@ module Opener
212
210
  parent_node = parent
213
211
  end
214
212
 
215
- node = Oga::XML::Element.new(:name => tag)
213
+ node = Nokogiri::XML::Element.new(tag, document)
216
214
 
217
- parent_node.children << node
215
+ parent_node.add_child node
218
216
 
219
- return node
217
+ node
220
218
  end
221
219
 
222
220
  ##
@@ -224,7 +222,7 @@ module Opener
224
222
  # @return [Boolean]
225
223
  #
226
224
  def is_kaf?
227
- return !!document.at_xpath('KAF')
225
+ !!document.at_xpath('KAF')
228
226
  end
229
227
 
230
228
  ##
@@ -41,15 +41,18 @@ module Opener
41
41
  end
42
42
 
43
43
  def load_aspects lang:, cache:, **params
44
- url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
- url += "&if_updated_since=#{cache.from.iso8601}" if cache
44
+ url = "#{@url}&language_code=#{lang}&#{params.to_query}"
45
+ url += "&if_updated_since=#{cache.from.utc.iso8601}" if cache
46
46
  puts "#{lang}: loading aspects from #{url}"
47
47
 
48
48
  lexicons = JSON.parse HTTPClient.new.get(url).body
49
49
  lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
50
50
  mapping = Hash.new{ |hash, key| hash[key] = [] }
51
51
  lexicons.each do |l|
52
- mapping[l.lemma.to_sym] << l.aspect
52
+ mapping[l.lemma.to_sym] << l
53
+ l.variants&.each do |v|
54
+ mapping[v.lemma.to_sym] << l
55
+ end
53
56
  end
54
57
 
55
58
  mapping
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
3
 
4
- VERSION = '3.3.2'
4
+ VERSION = '3.4.0'
5
5
 
6
6
  end
7
7
  end
@@ -28,7 +28,7 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
31
+ gem.add_dependency 'nokogiri'
32
32
  gem.add_dependency 'httpclient'
33
33
  gem.add_dependency 'hashie'
34
34
  gem.add_dependency 'activesupport'
metadata CHANGED
@@ -1,170 +1,164 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.2
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-07 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: opener-daemons
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '2.2'
19
- name: opener-daemons
20
- prerelease: false
21
20
  type: :runtime
21
+ prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: opener-webservice
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '2.1'
33
- name: opener-webservice
34
- prerelease: false
35
34
  type: :runtime
35
+ prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '2.1'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: opener-core
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - "~>"
45
46
  - !ruby/object:Gem::Version
46
47
  version: '2.2'
47
- name: opener-core
48
- prerelease: false
49
48
  type: :runtime
49
+ prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
+ name: nokogiri
56
57
  requirement: !ruby/object:Gem::Requirement
57
58
  requirements:
58
- - - "~>"
59
- - !ruby/object:Gem::Version
60
- version: '1.0'
61
59
  - - ">="
62
60
  - !ruby/object:Gem::Version
63
- version: 1.3.1
64
- name: oga
65
- prerelease: false
61
+ version: '0'
66
62
  type: :runtime
63
+ prerelease: false
67
64
  version_requirements: !ruby/object:Gem::Requirement
68
65
  requirements:
69
- - - "~>"
70
- - !ruby/object:Gem::Version
71
- version: '1.0'
72
66
  - - ">="
73
67
  - !ruby/object:Gem::Version
74
- version: 1.3.1
68
+ version: '0'
75
69
  - !ruby/object:Gem::Dependency
70
+ name: httpclient
76
71
  requirement: !ruby/object:Gem::Requirement
77
72
  requirements:
78
73
  - - ">="
79
74
  - !ruby/object:Gem::Version
80
75
  version: '0'
81
- name: httpclient
82
- prerelease: false
83
76
  type: :runtime
77
+ prerelease: false
84
78
  version_requirements: !ruby/object:Gem::Requirement
85
79
  requirements:
86
80
  - - ">="
87
81
  - !ruby/object:Gem::Version
88
82
  version: '0'
89
83
  - !ruby/object:Gem::Dependency
84
+ name: hashie
90
85
  requirement: !ruby/object:Gem::Requirement
91
86
  requirements:
92
87
  - - ">="
93
88
  - !ruby/object:Gem::Version
94
89
  version: '0'
95
- name: hashie
96
- prerelease: false
97
90
  type: :runtime
91
+ prerelease: false
98
92
  version_requirements: !ruby/object:Gem::Requirement
99
93
  requirements:
100
94
  - - ">="
101
95
  - !ruby/object:Gem::Version
102
96
  version: '0'
103
97
  - !ruby/object:Gem::Dependency
98
+ name: activesupport
104
99
  requirement: !ruby/object:Gem::Requirement
105
100
  requirements:
106
101
  - - ">="
107
102
  - !ruby/object:Gem::Version
108
103
  version: '0'
109
- name: activesupport
110
- prerelease: false
111
104
  type: :runtime
105
+ prerelease: false
112
106
  version_requirements: !ruby/object:Gem::Requirement
113
107
  requirements:
114
108
  - - ">="
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  - !ruby/object:Gem::Dependency
112
+ name: rspec
118
113
  requirement: !ruby/object:Gem::Requirement
119
114
  requirements:
120
115
  - - "~>"
121
116
  - !ruby/object:Gem::Version
122
117
  version: '3.0'
123
- name: rspec
124
- prerelease: false
125
118
  type: :development
119
+ prerelease: false
126
120
  version_requirements: !ruby/object:Gem::Requirement
127
121
  requirements:
128
122
  - - "~>"
129
123
  - !ruby/object:Gem::Version
130
124
  version: '3.0'
131
125
  - !ruby/object:Gem::Dependency
126
+ name: cucumber
132
127
  requirement: !ruby/object:Gem::Requirement
133
128
  requirements:
134
129
  - - ">="
135
130
  - !ruby/object:Gem::Version
136
131
  version: '0'
137
- name: cucumber
138
- prerelease: false
139
132
  type: :development
133
+ prerelease: false
140
134
  version_requirements: !ruby/object:Gem::Requirement
141
135
  requirements:
142
136
  - - ">="
143
137
  - !ruby/object:Gem::Version
144
138
  version: '0'
145
139
  - !ruby/object:Gem::Dependency
140
+ name: rake
146
141
  requirement: !ruby/object:Gem::Requirement
147
142
  requirements:
148
143
  - - ">="
149
144
  - !ruby/object:Gem::Version
150
145
  version: '0'
151
- name: rake
152
- prerelease: false
153
146
  type: :development
147
+ prerelease: false
154
148
  version_requirements: !ruby/object:Gem::Requirement
155
149
  requirements:
156
150
  - - ">="
157
151
  - !ruby/object:Gem::Version
158
152
  version: '0'
159
153
  - !ruby/object:Gem::Dependency
154
+ name: benchmark-ips
160
155
  requirement: !ruby/object:Gem::Requirement
161
156
  requirements:
162
157
  - - "~>"
163
158
  - !ruby/object:Gem::Version
164
159
  version: '2.0'
165
- name: benchmark-ips
166
- prerelease: false
167
160
  type: :development
161
+ prerelease: false
168
162
  version_requirements: !ruby/object:Gem::Requirement
169
163
  requirements:
170
164
  - - "~>"
@@ -219,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
219
213
  version: '0'
220
214
  requirements: []
221
215
  rubyforge_project:
222
- rubygems_version: 2.7.9
216
+ rubygems_version: 2.7.8
223
217
  signing_key:
224
218
  specification_version: 4
225
219
  summary: Property tagger for hotels in Dutch and English.