opener-property-tagger 3.0.5 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: db4ce8acc0c1dd88b2042f9a936012ea9a3c1448
4
- data.tar.gz: 4ed777c8b215fb98dd3687689cb1c58d97787dd0
2
+ SHA256:
3
+ metadata.gz: 168a0501dc6567285dd70b1a304fdc885b6d9493f9ca62863a428068b42d284e
4
+ data.tar.gz: 6e9dd2b446eca6e75d4644e5cb59ec8ad64ec6f7f88e79266142129ce5ee59eb
5
5
  SHA512:
6
- metadata.gz: 5cd57abe32b4d88f8ed10618c9e8466c1628faebda89ad1ceeaa956c571fdc00a6069cea44a1a1570df2ebe0136fc6e34c142d1850972a1bc3e2f926c640d13d
7
- data.tar.gz: e64b80100aac3dbfc3229e75692ef9ab16cbd362ca878e36f2ef5b492c456843fcb535c5f26105f6e334594a433c7ff9b050758068779803373aab76acac5e96
6
+ metadata.gz: 7a209b8da679c06ea5fce64a11142b22b8b7a3c4da2775f832f4266af152c624f8ddb9e81bf7cbb96d668babc303fedb1243988b243faa2c72a5b7438c47c9b6
7
+ data.tar.gz: 621b9a11a6c2230aee345fc4d072b8636d1c63b6533960103de475355a54baac7924867eb59176850d7b0aaaccdbc1a4a6eccf9af62652bbaf6f26097163778d
@@ -1,12 +1,18 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
3
  require 'oga'
4
+ require 'monitor'
5
+ require 'httpclient'
6
+ require 'hashie'
7
+ require 'json'
4
8
 
5
9
  require 'rexml/document'
6
10
  require 'rexml/formatters/pretty'
7
11
 
8
12
  require_relative 'property_tagger/version'
9
13
  require_relative 'property_tagger/cli'
14
+ require_relative 'property_tagger/aspects_cache'
15
+ require_relative 'property_tagger/remote_aspects_cache'
10
16
  require_relative 'property_tagger/processor'
11
17
 
12
18
  module Opener
@@ -41,36 +47,36 @@ module Opener
41
47
  # @return [String]
42
48
  #
43
49
  def path
44
- path = options[:resource_path] || ENV['RESOURCE_PATH'] ||
50
+ return @path if @path
51
+
52
+ @path = options[:resource_path] || ENV['RESOURCE_PATH'] ||
45
53
  ENV['PROPERTY_TAGGER_LEXICONS_PATH']
54
+ return unless @path
46
55
 
47
- unless path
48
- raise ArgumentError, 'No lexicon path provided'
49
- end
56
+ @path = File.expand_path @path
57
+ end
50
58
 
51
- return File.expand_path(path)
59
+ def remote_url
60
+ @remote_url ||= ENV['PROPERTY_TAGGER_LEXICONS_URL']
52
61
  end
53
62
 
54
63
  ##
55
- # Processes the input and returns an Array containing the output of STDOUT,
56
- # STDERR and an object containing process information.
64
+ # Processes the input KAF document.
57
65
  #
58
- # @param [String] input The text of which to detect the language.
59
- # @return [Array]
66
+ # @param [String] input
67
+ # @return [String]
60
68
  #
61
- def run(input)
62
- output = process(input)
69
+ def run input
70
+ timestamp = !options[:no_time]
63
71
 
64
- return output
72
+ Processor.new(input,
73
+ url: remote_url,
74
+ path: path,
75
+ timestamp: timestamp,
76
+ pretty: options[:pretty],
77
+ ).process
65
78
  end
66
79
 
67
- protected
68
-
69
- def process(input)
70
- processor = Processor.new(input, path, !options[:no_time])
71
-
72
- return processor.process
73
- end
74
- end # PolarityTagger
75
- end # Opener
80
+ end
81
+ end
76
82
 
@@ -0,0 +1,47 @@
1
module Opener
  class PropertyTagger
    ##
    # Thread-safe cache for storing the contents of aspect files.
    #
    # Every aspects file is parsed at most once per cache instance; later
    # lookups for the same path return the very same mapping object.
    #
    class AspectsCache
      include MonitorMixin

      def initialize
        # MonitorMixin sets up its monitor in #initialize, so `super` is needed
        # before #synchronize can be used.
        super

        @cache = {}
      end

      ##
      # Returns the aspects for the given file path. If the aspects don't exist
      # they are first loaded into the cache.
      #
      # The cached mapping is returned explicitly: the value of a bare
      # `a = b unless cond` expression is nil whenever `cond` holds, which
      # would make every cache hit return nil.
      #
      # @param [String] path
      # @return [Hash] lemma (Symbol) => Array of aspects
      #
      def [](path)
        synchronize do
          @cache[path] = load_aspects(path) unless @cache.key?(path)

          @cache[path]
        end
      end

      alias_method :get, :[]

      ##
      # Loads the aspects of the given path.
      #
      # Each line is expected to hold tab separated `lemma`, `pos` and
      # `aspect` columns; the `pos` column is read but not used. Blank lines
      # are skipped.
      #
      # NOTE(review): the returned Hash keeps its default block, so reading an
      # unknown lemma stores an empty Array under that key in the (shared)
      # cached Hash - confirm callers are fine with that.
      #
      # @param [String] path
      # @return [Hash] lemma (Symbol) => Array of aspects
      #
      def load_aspects(path)
        mapping = Hash.new { |hash, key| hash[key] = [] }

        File.foreach(path) do |line|
          lemma, _pos, aspect = line.chomp.split("\t")

          # A blank line yields no columns at all; calling #to_sym on the
          # missing lemma would raise NoMethodError.
          next unless lemma

          mapping[lemma.to_sym] << aspect
        end

        mapping
      end
    end # AspectsCache
  end # PropertyTagger
end # Opener
@@ -56,10 +56,13 @@ Examples:
56
56
 
57
57
  on :'no-time', 'Disables adding of timestamps'
58
58
 
59
+ on :ugly, 'Disables pretty formatting of XML (faster)'
60
+
59
61
  run do |opts, args|
60
62
  tagger = PropertyTagger.new(
61
63
  :args => args,
62
- :no_time => opts[:'no-time']
64
+ :no_time => opts[:'no-time'],
65
+ :pretty => !opts[:ugly]
63
66
  )
64
67
 
65
68
  input = STDIN.tty? ? nil : STDIN.read
@@ -4,15 +4,37 @@ module Opener
4
4
  # Class that applies property tagging to a given input KAF file.
5
5
  #
6
6
  class Processor
7
- attr_accessor :document, :aspects_path, :language, :aspects, :terms,
8
- :timestamp
9
7
 
10
- def initialize(file, aspects_path, timestamp = true)
11
- @document = Oga.parse_xml(file)
12
- @aspects_path = aspects_path
13
- @timestamp = timestamp
8
+ attr_accessor :document
9
+ attr_accessor :aspects, :aspects_path, :aspects_url
10
+ attr_accessor :timestamp, :pretty
14
11
 
12
+ ##
13
+ # Global cache used for storing loaded aspects.
14
+ #
15
+ # @return [Opener::PropertyTagger::AspectsCache.new]
16
+ #
17
+ ASPECTS_CACHE = AspectsCache.new
18
+ REMOTE_ASPECTS_CACHE = RemoteAspectsCache.new
19
+
20
+ ##
21
+ # @param [String|IO] file The KAF file/input to process.
22
+ # @param [String] aspects_path Path to the aspects.
23
+ # @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
24
+ # @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
25
+ # by default due to the performance overhead.
26
+ #
27
+ def initialize file, url: nil, path: nil, timestamp: true, pretty: false
28
+ @document = Oga.parse_xml file
15
29
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
+ @timestamp = timestamp
31
+ @pretty = pretty
32
+
33
+ @remote = !url.nil?
34
+ @aspects_path = path
35
+ @aspects_url = url
36
+
37
+ @aspects = if @remote then REMOTE_ASPECTS_CACHE[language] else ASPECTS_CACHE[aspects_file] end
16
38
  end
17
39
 
18
40
  ##
@@ -20,64 +42,45 @@ module Opener
20
42
  # @return [String]
21
43
  #
22
44
  def process
23
- @language = get_language
24
- @aspects = load_aspects
25
- @terms = get_terms
26
-
27
45
  existing_aspects = extract_aspects
28
46
 
29
47
  add_features_layer
30
48
  add_properties_layer
31
49
 
32
- index = 1
50
+ existing_aspects.each_with_index do |(key, value), index|
51
+ index += 1
33
52
 
34
- existing_aspects.each_pair do |key,value|
35
53
  add_property(key, value, index)
36
- index += 1
37
54
  end
38
55
 
39
56
  add_linguistic_processor
40
57
 
41
- return pretty_print(document)
42
- end
43
-
44
- ##
45
- # Loads the aspects from the txt file
46
- # @return [Hash]
47
- #
48
- def load_aspects
49
- aspects_hash = {}
50
-
51
- File.foreach(aspects_file) do |line|
52
- lemma, pos, aspect = line.gsub("\n", "").split("\t")
53
-
54
- aspects_hash[lemma.to_sym] = [] unless aspects_hash[lemma.to_sym]
55
- aspects_hash[lemma.to_sym] << aspect
56
- end
57
-
58
- return aspects_hash
58
+ return pretty ? pretty_print(document) : document.to_xml
59
59
  end
60
60
 
61
61
  ##
62
62
  # Get the language of the input file.
63
+ #
63
64
  # @return [String]
64
65
  #
65
- def get_language
66
- document.at_xpath('KAF').get('xml:lang')
66
+ def language
67
+ return @language ||= document.at_xpath('KAF').get('xml:lang')
67
68
  end
68
69
 
69
70
  ##
70
71
  # Get the terms from the input file
71
72
  # @return [Hash]
72
73
  #
73
- def get_terms
74
- terms_hash = {}
74
+ def terms
75
+ unless @terms
76
+ @terms = {}
75
77
 
76
- document.xpath('KAF/terms/term').each do |term|
77
- terms_hash[term.get('tid').to_sym] = term.get('lemma')
78
+ document.xpath('KAF/terms/term').each do |term|
79
+ @terms[term.get('tid').to_sym] = term.get('lemma')
80
+ end
78
81
  end
79
82
 
80
- return terms_hash
83
+ return @terms
81
84
  end
82
85
 
83
86
  ##
@@ -93,7 +96,7 @@ module Opener
93
96
  # lemmas) belong to a property.
94
97
  max_ngram = 2
95
98
 
96
- uniq_aspects = {}
99
+ uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
97
100
 
98
101
  while current_token < terms.count
99
102
  (0..max_ngram).each do |tam_ngram|
@@ -107,7 +110,6 @@ module Opener
107
110
  properties.uniq.each do |property|
108
111
  next if !property or property.strip.empty?
109
112
 
110
- uniq_aspects[property.to_sym] = [] unless uniq_aspects[property.to_sym]
111
113
  uniq_aspects[property.to_sym] << [ids,ngram]
112
114
  end
113
115
  end
@@ -223,9 +225,13 @@ module Opener
223
225
  return !!document.at_xpath('KAF')
224
226
  end
225
227
 
228
+ ##
229
+ # @return [String]
230
+ #
226
231
  def aspects_file
227
- return File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
232
+ @aspects_file ||= File.expand_path "#{aspects_path}/#{language}.txt", __FILE__
228
233
  end
229
- end # Processor
230
- end # PropertyTagger
231
- end # Opener
234
+
235
+ end
236
+ end
237
+ end
@@ -0,0 +1,40 @@
1
module Opener
  class PropertyTagger
    ##
    # Thread-safe cache for storing the contents of remote aspects.
    #
    # Aspects are fetched over HTTP from the URL found in the
    # `PROPERTY_TAGGER_LEXICONS_URL` environment variable (read once, when the
    # cache is created) and kept per language code.
    #
    class RemoteAspectsCache
      include MonitorMixin

      def initialize
        # MonitorMixin sets up its monitor in #initialize, so `super` is needed
        # before #synchronize can be used.
        super

        @url   = ENV['PROPERTY_TAGGER_LEXICONS_URL']
        @cache = {}
      end

      ##
      # Returns the aspects for the given language, loading them from the
      # remote service the first time a language is requested.
      #
      # @param [String] lang
      # @return [Hash] lemma (Symbol) => Array of aspects
      #
      def [](lang)
        synchronize do
          @cache[lang] ||= load_aspects(lang)
        end
      end

      alias_method :get, :[]

      ##
      # Downloads and parses the aspects of the given language.
      #
      # The response body is parsed as JSON and is expected to carry an Array
      # under the `data` key whose entries have `lemma` and `aspect` keys.
      #
      # @param [String] lang
      # @return [Hash] lemma (Symbol) => Array of aspects
      #
      def load_aspects(lang)
        mapping = Hash.new { |hash, key| hash[key] = [] }

        # Only use "&" when the base URL already has a query string, otherwise
        # the language parameter would be glued onto the path.
        separator = @url.to_s.include?('?') ? '&' : '?'
        url       = "#{@url}#{separator}language_code=#{lang}"
        payload   = JSON.parse(HTTPClient.new.get(url).body)

        payload['data'].each do |lexicon|
          mapping[lexicon['lemma'].to_sym] << lexicon['aspect']
        end

        # Diagnostics go to STDERR so they never end up inside output that is
        # written to STDOUT.
        $stderr.puts "#{lang}: loaded aspects from #{url}"

        mapping
      end
    end
  end
end
@@ -1,5 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
- VERSION = '3.0.5'
4
- end # PropertyTagger
5
- end # Opener
3
+
4
+ VERSION = '3.2.1'
5
+
6
+ end
7
+ end
@@ -28,9 +28,12 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga'
31
+ gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
32
+ gem.add_dependency 'httpclient'
33
+ gem.add_dependency 'hashie'
32
34
 
33
35
  gem.add_development_dependency 'rspec', '~> 3.0'
34
36
  gem.add_development_dependency 'cucumber'
35
37
  gem.add_development_dependency 'rake'
38
+ gem.add_development_dependency 'benchmark-ips', '~> 2.0'
36
39
  end
@@ -2,5 +2,6 @@ desc 'Runs the tests'
2
2
  task :test => [:lexicons] do
3
3
  ENV['RESOURCE_PATH'] = File.expand_path('../../tmp/lexicons/hotel', __FILE__)
4
4
 
5
- sh('cucumber features')
5
+ sh 'cucumber features'
6
+ sh 'rspec spec --order random'
6
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.5
4
+ version: 3.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2020-09-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -54,6 +54,40 @@ dependencies:
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: oga
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 1.3.1
65
+ type: :runtime
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '1.0'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 1.3.1
75
+ - !ruby/object:Gem::Dependency
76
+ name: httpclient
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ - !ruby/object:Gem::Dependency
90
+ name: hashie
57
91
  requirement: !ruby/object:Gem::Requirement
58
92
  requirements:
59
93
  - - ">="
@@ -108,8 +142,22 @@ dependencies:
108
142
  - - ">="
109
143
  - !ruby/object:Gem::Version
110
144
  version: '0'
145
+ - !ruby/object:Gem::Dependency
146
+ name: benchmark-ips
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - "~>"
150
+ - !ruby/object:Gem::Version
151
+ version: '2.0'
152
+ type: :development
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: '2.0'
111
159
  description: Property tagger for hotels in Dutch and English.
112
- email:
160
+ email:
113
161
  executables:
114
162
  - property-tagger
115
163
  - property-tagger-daemon
@@ -125,9 +173,11 @@ files:
125
173
  - config.ru
126
174
  - exec/property-tagger.rb
127
175
  - lib/opener/property_tagger.rb
176
+ - lib/opener/property_tagger/aspects_cache.rb
128
177
  - lib/opener/property_tagger/cli.rb
129
178
  - lib/opener/property_tagger/processor.rb
130
179
  - lib/opener/property_tagger/public/markdown.css
180
+ - lib/opener/property_tagger/remote_aspects_cache.rb
131
181
  - lib/opener/property_tagger/server.rb
132
182
  - lib/opener/property_tagger/version.rb
133
183
  - lib/opener/property_tagger/views/index.erb
@@ -139,7 +189,7 @@ homepage: http://opener-project.github.com/
139
189
  licenses:
140
190
  - Apache 2.0
141
191
  metadata: {}
142
- post_install_message:
192
+ post_install_message:
143
193
  rdoc_options: []
144
194
  require_paths:
145
195
  - lib
@@ -154,10 +204,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
204
  - !ruby/object:Gem::Version
155
205
  version: '0'
156
206
  requirements: []
157
- rubyforge_project:
158
- rubygems_version: 2.2.2
159
- signing_key:
207
+ rubyforge_project:
208
+ rubygems_version: 2.7.8
209
+ signing_key:
160
210
  specification_version: 4
161
211
  summary: Property tagger for hotels in Dutch and English.
162
212
  test_files: []
163
- has_rdoc: