opener-property-tagger 3.0.5 → 3.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: db4ce8acc0c1dd88b2042f9a936012ea9a3c1448
4
- data.tar.gz: 4ed777c8b215fb98dd3687689cb1c58d97787dd0
2
+ SHA256:
3
+ metadata.gz: 168a0501dc6567285dd70b1a304fdc885b6d9493f9ca62863a428068b42d284e
4
+ data.tar.gz: 6e9dd2b446eca6e75d4644e5cb59ec8ad64ec6f7f88e79266142129ce5ee59eb
5
5
  SHA512:
6
- metadata.gz: 5cd57abe32b4d88f8ed10618c9e8466c1628faebda89ad1ceeaa956c571fdc00a6069cea44a1a1570df2ebe0136fc6e34c142d1850972a1bc3e2f926c640d13d
7
- data.tar.gz: e64b80100aac3dbfc3229e75692ef9ab16cbd362ca878e36f2ef5b492c456843fcb535c5f26105f6e334594a433c7ff9b050758068779803373aab76acac5e96
6
+ metadata.gz: 7a209b8da679c06ea5fce64a11142b22b8b7a3c4da2775f832f4266af152c624f8ddb9e81bf7cbb96d668babc303fedb1243988b243faa2c72a5b7438c47c9b6
7
+ data.tar.gz: 621b9a11a6c2230aee345fc4d072b8636d1c63b6533960103de475355a54baac7924867eb59176850d7b0aaaccdbc1a4a6eccf9af62652bbaf6f26097163778d
@@ -1,12 +1,18 @@
1
1
  require 'open3'
2
2
  require 'slop'
3
3
  require 'oga'
4
+ require 'monitor'
5
+ require 'httpclient'
6
+ require 'hashie'
7
+ require 'json'
4
8
 
5
9
  require 'rexml/document'
6
10
  require 'rexml/formatters/pretty'
7
11
 
8
12
  require_relative 'property_tagger/version'
9
13
  require_relative 'property_tagger/cli'
14
+ require_relative 'property_tagger/aspects_cache'
15
+ require_relative 'property_tagger/remote_aspects_cache'
10
16
  require_relative 'property_tagger/processor'
11
17
 
12
18
  module Opener
@@ -41,36 +47,36 @@ module Opener
41
47
  # @return [String]
42
48
  #
43
49
  def path
44
- path = options[:resource_path] || ENV['RESOURCE_PATH'] ||
50
+ return @path if @path
51
+
52
+ @path = options[:resource_path] || ENV['RESOURCE_PATH'] ||
45
53
  ENV['PROPERTY_TAGGER_LEXICONS_PATH']
54
+ return unless @path
46
55
 
47
- unless path
48
- raise ArgumentError, 'No lexicon path provided'
49
- end
56
+ @path = File.expand_path @path
57
+ end
50
58
 
51
- return File.expand_path(path)
59
+ def remote_url
60
+ @remote_url ||= ENV['PROPERTY_TAGGER_LEXICONS_URL']
52
61
  end
53
62
 
54
63
  ##
55
- # Processes the input and returns an Array containing the output of STDOUT,
56
- # STDERR and an object containing process information.
64
+ # Processes the input KAF document.
57
65
  #
58
- # @param [String] input The text of which to detect the language.
59
- # @return [Array]
66
+ # @param [String] input
67
+ # @return [String]
60
68
  #
61
- def run(input)
62
- output = process(input)
69
+ def run input
70
+ timestamp = !options[:no_time]
63
71
 
64
- return output
72
+ Processor.new(input,
73
+ url: remote_url,
74
+ path: path,
75
+ timestamp: timestamp,
76
+ pretty: options[:pretty],
77
+ ).process
65
78
  end
66
79
 
67
- protected
68
-
69
- def process(input)
70
- processor = Processor.new(input, path, !options[:no_time])
71
-
72
- return processor.process
73
- end
74
- end # PolarityTagger
75
- end # Opener
80
+ end
81
+ end
76
82
 
@@ -0,0 +1,47 @@
1
+ module Opener
2
+ class PropertyTagger
3
+ ##
4
+ # Thread-safe cache for storing the contents of aspect files.
5
+ #
6
+ class AspectsCache
7
+ include MonitorMixin
8
+
9
+ def initialize
10
+ super
11
+
12
+ @cache = {}
13
+ end
14
+
15
+ ##
16
+ # Returns the aspects for the given file path. If the aspects don't exist
17
+ # they are first loaded into the cache.
18
+ #
19
+ # @param [String] path
20
+ #
21
+ def [](path)
22
+ synchronize do
23
+ @cache[path] = load_aspects(path) unless @cache.key?(path)
24
+ end
25
+ end
26
+
27
+ alias_method :get, :[]
28
+
29
+ ##
30
+ # Loads the aspects of the given path.
31
+ #
32
+ # @param [String] path
33
+ #
34
+ def load_aspects(path)
35
+ mapping = Hash.new { |hash, key| hash[key] = [] }
36
+
37
+ File.foreach(path) do |line|
38
+ lemma, pos, aspect = line.chomp.split("\t")
39
+
40
+ mapping[lemma.to_sym] << aspect
41
+ end
42
+
43
+ return mapping
44
+ end
45
+ end # AspectsCache
46
+ end # PropertyTagger
47
+ end # Opener
@@ -56,10 +56,13 @@ Examples:
56
56
 
57
57
  on :'no-time', 'Disables adding of timestamps'
58
58
 
59
+ on :ugly, 'Disables pretty formatting of XML (faster)'
60
+
59
61
  run do |opts, args|
60
62
  tagger = PropertyTagger.new(
61
63
  :args => args,
62
- :no_time => opts[:'no-time']
64
+ :no_time => opts[:'no-time'],
65
+ :pretty => !opts[:ugly]
63
66
  )
64
67
 
65
68
  input = STDIN.tty? ? nil : STDIN.read
@@ -4,15 +4,37 @@ module Opener
4
4
  # Class that applies property tagging to a given input KAF file.
5
5
  #
6
6
  class Processor
7
- attr_accessor :document, :aspects_path, :language, :aspects, :terms,
8
- :timestamp
9
7
 
10
- def initialize(file, aspects_path, timestamp = true)
11
- @document = Oga.parse_xml(file)
12
- @aspects_path = aspects_path
13
- @timestamp = timestamp
8
+ attr_accessor :document
9
+ attr_accessor :aspects, :aspects_path, :aspects_url
10
+ attr_accessor :timestamp, :pretty
14
11
 
12
+ ##
13
+ # Global cache used for storing loaded aspects.
14
+ #
15
+ # @return [Opener::PropertyTagger::AspectsCache.new]
16
+ #
17
+ ASPECTS_CACHE = AspectsCache.new
18
+ REMOTE_ASPECTS_CACHE = RemoteAspectsCache.new
19
+
20
+ ##
21
+ # @param [String|IO] file The KAF file/input to process.
22
+ # @param [String] aspects_path Path to the aspects.
23
+ # @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
24
+ # @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
25
+ # by default due to the performance overhead.
26
+ #
27
+ def initialize file, url: nil, path: nil, timestamp: true, pretty: false
28
+ @document = Oga.parse_xml file
15
29
  raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
30
+ @timestamp = timestamp
31
+ @pretty = pretty
32
+
33
+ @remote = !url.nil?
34
+ @aspects_path = path
35
+ @aspects_url = url
36
+
37
+ @aspects = if @remote then REMOTE_ASPECTS_CACHE[language] else ASPECTS_CACHE[aspects_file] end
16
38
  end
17
39
 
18
40
  ##
@@ -20,64 +42,45 @@ module Opener
20
42
  # @return [String]
21
43
  #
22
44
  def process
23
- @language = get_language
24
- @aspects = load_aspects
25
- @terms = get_terms
26
-
27
45
  existing_aspects = extract_aspects
28
46
 
29
47
  add_features_layer
30
48
  add_properties_layer
31
49
 
32
- index = 1
50
+ existing_aspects.each_with_index do |(key, value), index|
51
+ index += 1
33
52
 
34
- existing_aspects.each_pair do |key,value|
35
53
  add_property(key, value, index)
36
- index += 1
37
54
  end
38
55
 
39
56
  add_linguistic_processor
40
57
 
41
- return pretty_print(document)
42
- end
43
-
44
- ##
45
- # Loads the aspects from the txt file
46
- # @return [Hash]
47
- #
48
- def load_aspects
49
- aspects_hash = {}
50
-
51
- File.foreach(aspects_file) do |line|
52
- lemma, pos, aspect = line.gsub("\n", "").split("\t")
53
-
54
- aspects_hash[lemma.to_sym] = [] unless aspects_hash[lemma.to_sym]
55
- aspects_hash[lemma.to_sym] << aspect
56
- end
57
-
58
- return aspects_hash
58
+ return pretty ? pretty_print(document) : document.to_xml
59
59
  end
60
60
 
61
61
  ##
62
62
  # Get the language of the input file.
63
+ #
63
64
  # @return [String]
64
65
  #
65
- def get_language
66
- document.at_xpath('KAF').get('xml:lang')
66
+ def language
67
+ return @language ||= document.at_xpath('KAF').get('xml:lang')
67
68
  end
68
69
 
69
70
  ##
70
71
  # Get the terms from the input file
71
72
  # @return [Hash]
72
73
  #
73
- def get_terms
74
- terms_hash = {}
74
+ def terms
75
+ unless @terms
76
+ @terms = {}
75
77
 
76
- document.xpath('KAF/terms/term').each do |term|
77
- terms_hash[term.get('tid').to_sym] = term.get('lemma')
78
+ document.xpath('KAF/terms/term').each do |term|
79
+ @terms[term.get('tid').to_sym] = term.get('lemma')
80
+ end
78
81
  end
79
82
 
80
- return terms_hash
83
+ return @terms
81
84
  end
82
85
 
83
86
  ##
@@ -93,7 +96,7 @@ module Opener
93
96
  # lemmas) belong to a property.
94
97
  max_ngram = 2
95
98
 
96
- uniq_aspects = {}
99
+ uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
97
100
 
98
101
  while current_token < terms.count
99
102
  (0..max_ngram).each do |tam_ngram|
@@ -107,7 +110,6 @@ module Opener
107
110
  properties.uniq.each do |property|
108
111
  next if !property or property.strip.empty?
109
112
 
110
- uniq_aspects[property.to_sym] = [] unless uniq_aspects[property.to_sym]
111
113
  uniq_aspects[property.to_sym] << [ids,ngram]
112
114
  end
113
115
  end
@@ -223,9 +225,13 @@ module Opener
223
225
  return !!document.at_xpath('KAF')
224
226
  end
225
227
 
228
+ ##
229
+ # @return [String]
230
+ #
226
231
  def aspects_file
227
- return File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
232
+ @aspects_file ||= File.expand_path "#{aspects_path}/#{language}.txt", __FILE__
228
233
  end
229
- end # Processor
230
- end # PropertyTagger
231
- end # Opener
234
+
235
+ end
236
+ end
237
+ end
@@ -0,0 +1,40 @@
1
+ module Opener
2
+ class PropertyTagger
3
+ ##
4
+ # Thread-safe cache for storing the contents of remote aspects.
5
+ #
6
+ class RemoteAspectsCache
7
+
8
+ include MonitorMixin
9
+
10
+ def initialize
11
+ super
12
+
13
+ @url = ENV['PROPERTY_TAGGER_LEXICONS_URL']
14
+ @cache = {}
15
+ end
16
+
17
+ def [] lang
18
+ synchronize do
19
+ @cache[lang] ||= load_aspects lang
20
+ end
21
+ end
22
+ alias_method :get, :[]
23
+
24
+ def load_aspects lang
25
+ mapping = Hash.new{ |hash, key| hash[key] = [] }
26
+ url = "#{@url}&language_code=#{lang}"
27
+ lexicons = JSON.parse HTTPClient.new.get(url).body
28
+ lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
29
+ puts "#{lang}: loaded aspects from #{url}"
30
+
31
+ lexicons.each do |l|
32
+ mapping[l.lemma.to_sym] << l.aspect
33
+ end
34
+
35
+ return mapping
36
+ end
37
+
38
+ end
39
+ end
40
+ end
@@ -1,5 +1,7 @@
1
1
  module Opener
2
2
  class PropertyTagger
3
- VERSION = '3.0.5'
4
- end # PropertyTagger
5
- end # Opener
3
+
4
+ VERSION = '3.2.1'
5
+
6
+ end
7
+ end
@@ -28,9 +28,12 @@ Gem::Specification.new do |gem|
28
28
  gem.add_dependency 'opener-webservice', '~> 2.1'
29
29
  gem.add_dependency 'opener-core', '~> 2.2'
30
30
 
31
- gem.add_dependency 'oga'
31
+ gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
32
+ gem.add_dependency 'httpclient'
33
+ gem.add_dependency 'hashie'
32
34
 
33
35
  gem.add_development_dependency 'rspec', '~> 3.0'
34
36
  gem.add_development_dependency 'cucumber'
35
37
  gem.add_development_dependency 'rake'
38
+ gem.add_development_dependency 'benchmark-ips', '~> 2.0'
36
39
  end
@@ -2,5 +2,6 @@ desc 'Runs the tests'
2
2
  task :test => [:lexicons] do
3
3
  ENV['RESOURCE_PATH'] = File.expand_path('../../tmp/lexicons/hotel', __FILE__)
4
4
 
5
- sh('cucumber features')
5
+ sh 'cucumber features'
6
+ sh 'rspec spec --order random'
6
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-property-tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.5
4
+ version: 3.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2020-09-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -54,6 +54,40 @@ dependencies:
54
54
  version: '2.2'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: oga
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 1.3.1
65
+ type: :runtime
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '1.0'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 1.3.1
75
+ - !ruby/object:Gem::Dependency
76
+ name: httpclient
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ - !ruby/object:Gem::Dependency
90
+ name: hashie
57
91
  requirement: !ruby/object:Gem::Requirement
58
92
  requirements:
59
93
  - - ">="
@@ -108,8 +142,22 @@ dependencies:
108
142
  - - ">="
109
143
  - !ruby/object:Gem::Version
110
144
  version: '0'
145
+ - !ruby/object:Gem::Dependency
146
+ name: benchmark-ips
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - "~>"
150
+ - !ruby/object:Gem::Version
151
+ version: '2.0'
152
+ type: :development
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: '2.0'
111
159
  description: Property tagger for hotels in Dutch and English.
112
- email:
160
+ email:
113
161
  executables:
114
162
  - property-tagger
115
163
  - property-tagger-daemon
@@ -125,9 +173,11 @@ files:
125
173
  - config.ru
126
174
  - exec/property-tagger.rb
127
175
  - lib/opener/property_tagger.rb
176
+ - lib/opener/property_tagger/aspects_cache.rb
128
177
  - lib/opener/property_tagger/cli.rb
129
178
  - lib/opener/property_tagger/processor.rb
130
179
  - lib/opener/property_tagger/public/markdown.css
180
+ - lib/opener/property_tagger/remote_aspects_cache.rb
131
181
  - lib/opener/property_tagger/server.rb
132
182
  - lib/opener/property_tagger/version.rb
133
183
  - lib/opener/property_tagger/views/index.erb
@@ -139,7 +189,7 @@ homepage: http://opener-project.github.com/
139
189
  licenses:
140
190
  - Apache 2.0
141
191
  metadata: {}
142
- post_install_message:
192
+ post_install_message:
143
193
  rdoc_options: []
144
194
  require_paths:
145
195
  - lib
@@ -154,10 +204,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
204
  - !ruby/object:Gem::Version
155
205
  version: '0'
156
206
  requirements: []
157
- rubyforge_project:
158
- rubygems_version: 2.2.2
159
- signing_key:
207
+ rubyforge_project:
208
+ rubygems_version: 2.7.8
209
+ signing_key:
160
210
  specification_version: 4
161
211
  summary: Property tagger for hotels in Dutch and English.
162
212
  test_files: []
163
- has_rdoc: