opener-property-tagger 3.0.5 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/opener/property_tagger.rb +27 -21
- data/lib/opener/property_tagger/aspects_cache.rb +47 -0
- data/lib/opener/property_tagger/cli.rb +4 -1
- data/lib/opener/property_tagger/processor.rb +50 -44
- data/lib/opener/property_tagger/remote_aspects_cache.rb +40 -0
- data/lib/opener/property_tagger/version.rb +5 -3
- data/opener-property-tagger.gemspec +4 -1
- data/task/test.rake +2 -1
- metadata +58 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 168a0501dc6567285dd70b1a304fdc885b6d9493f9ca62863a428068b42d284e
|
4
|
+
data.tar.gz: 6e9dd2b446eca6e75d4644e5cb59ec8ad64ec6f7f88e79266142129ce5ee59eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a209b8da679c06ea5fce64a11142b22b8b7a3c4da2775f832f4266af152c624f8ddb9e81bf7cbb96d668babc303fedb1243988b243faa2c72a5b7438c47c9b6
|
7
|
+
data.tar.gz: 621b9a11a6c2230aee345fc4d072b8636d1c63b6533960103de475355a54baac7924867eb59176850d7b0aaaccdbc1a4a6eccf9af62652bbaf6f26097163778d
|
@@ -1,12 +1,18 @@
|
|
1
1
|
require 'open3'
|
2
2
|
require 'slop'
|
3
3
|
require 'oga'
|
4
|
+
require 'monitor'
|
5
|
+
require 'httpclient'
|
6
|
+
require 'hashie'
|
7
|
+
require 'json'
|
4
8
|
|
5
9
|
require 'rexml/document'
|
6
10
|
require 'rexml/formatters/pretty'
|
7
11
|
|
8
12
|
require_relative 'property_tagger/version'
|
9
13
|
require_relative 'property_tagger/cli'
|
14
|
+
require_relative 'property_tagger/aspects_cache'
|
15
|
+
require_relative 'property_tagger/remote_aspects_cache'
|
10
16
|
require_relative 'property_tagger/processor'
|
11
17
|
|
12
18
|
module Opener
|
@@ -41,36 +47,36 @@ module Opener
|
|
41
47
|
# @return [String]
|
42
48
|
#
|
43
49
|
def path
|
44
|
-
path
|
50
|
+
return @path if @path
|
51
|
+
|
52
|
+
@path = options[:resource_path] || ENV['RESOURCE_PATH'] ||
|
45
53
|
ENV['PROPERTY_TAGGER_LEXICONS_PATH']
|
54
|
+
return unless @path
|
46
55
|
|
47
|
-
|
48
|
-
|
49
|
-
end
|
56
|
+
@path = File.expand_path @path
|
57
|
+
end
|
50
58
|
|
51
|
-
|
59
|
+
def remote_url
|
60
|
+
@remote_url ||= ENV['PROPERTY_TAGGER_LEXICONS_URL']
|
52
61
|
end
|
53
62
|
|
54
63
|
##
|
55
|
-
# Processes the input
|
56
|
-
# STDERR and an object containing process information.
|
64
|
+
# Processes the input KAF document.
|
57
65
|
#
|
58
|
-
# @param [String] input
|
59
|
-
# @return [
|
66
|
+
# @param [String] input
|
67
|
+
# @return [String]
|
60
68
|
#
|
61
|
-
def run
|
62
|
-
|
69
|
+
def run input
|
70
|
+
timestamp = !options[:no_time]
|
63
71
|
|
64
|
-
|
72
|
+
Processor.new(input,
|
73
|
+
url: remote_url,
|
74
|
+
path: path,
|
75
|
+
timestamp: timestamp,
|
76
|
+
pretty: options[:pretty],
|
77
|
+
).process
|
65
78
|
end
|
66
79
|
|
67
|
-
|
68
|
-
|
69
|
-
def process(input)
|
70
|
-
processor = Processor.new(input, path, !options[:no_time])
|
71
|
-
|
72
|
-
return processor.process
|
73
|
-
end
|
74
|
-
end # PolarityTagger
|
75
|
-
end # Opener
|
80
|
+
end
|
81
|
+
end
|
76
82
|
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Opener
|
2
|
+
class PropertyTagger
|
3
|
+
##
|
4
|
+
# Thread-safe cache for storing the contents of aspect files.
|
5
|
+
#
|
6
|
+
class AspectsCache
|
7
|
+
include MonitorMixin
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super
|
11
|
+
|
12
|
+
@cache = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Returns the aspects for the given file path. If the aspects don't exist
|
17
|
+
# they are first loaded into the cache.
|
18
|
+
#
|
19
|
+
# @param [String] path
|
20
|
+
#
|
21
|
+
def [](path)
|
22
|
+
synchronize do
|
23
|
+
@cache[path] = load_aspects(path) unless @cache.key?(path)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
alias_method :get, :[]
|
28
|
+
|
29
|
+
##
|
30
|
+
# Loads the aspects of the given path.
|
31
|
+
#
|
32
|
+
# @param [String] path
|
33
|
+
#
|
34
|
+
def load_aspects(path)
|
35
|
+
mapping = Hash.new { |hash, key| hash[key] = [] }
|
36
|
+
|
37
|
+
File.foreach(path) do |line|
|
38
|
+
lemma, pos, aspect = line.chomp.split("\t")
|
39
|
+
|
40
|
+
mapping[lemma.to_sym] << aspect
|
41
|
+
end
|
42
|
+
|
43
|
+
return mapping
|
44
|
+
end
|
45
|
+
end # AspectsCache
|
46
|
+
end # PropertyTagger
|
47
|
+
end # Opener
|
@@ -56,10 +56,13 @@ Examples:
|
|
56
56
|
|
57
57
|
on :'no-time', 'Disables adding of timestamps'
|
58
58
|
|
59
|
+
on :ugly, 'Disables pretty formatting of XML (faster)'
|
60
|
+
|
59
61
|
run do |opts, args|
|
60
62
|
tagger = PropertyTagger.new(
|
61
63
|
:args => args,
|
62
|
-
:no_time => opts[:'no-time']
|
64
|
+
:no_time => opts[:'no-time'],
|
65
|
+
:pretty => !opts[:ugly]
|
63
66
|
)
|
64
67
|
|
65
68
|
input = STDIN.tty? ? nil : STDIN.read
|
@@ -4,15 +4,37 @@ module Opener
|
|
4
4
|
# Class that applies property tagging to a given input KAF file.
|
5
5
|
#
|
6
6
|
class Processor
|
7
|
-
attr_accessor :document, :aspects_path, :language, :aspects, :terms,
|
8
|
-
:timestamp
|
9
7
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
@timestamp = timestamp
|
8
|
+
attr_accessor :document
|
9
|
+
attr_accessor :aspects, :aspects_path, :aspects_url
|
10
|
+
attr_accessor :timestamp, :pretty
|
14
11
|
|
12
|
+
##
|
13
|
+
# Global cache used for storing loaded aspects.
|
14
|
+
#
|
15
|
+
# @return [Opener::PropertyTagger::AspectsCache.new]
|
16
|
+
#
|
17
|
+
ASPECTS_CACHE = AspectsCache.new
|
18
|
+
REMOTE_ASPECTS_CACHE = RemoteAspectsCache.new
|
19
|
+
|
20
|
+
##
|
21
|
+
# @param [String|IO] file The KAF file/input to process.
|
22
|
+
# @param [String] aspects_path Path to the aspects.
|
23
|
+
# @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
|
24
|
+
# @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
|
25
|
+
# by default due to the performance overhead.
|
26
|
+
#
|
27
|
+
def initialize file, url: nil, path: nil, timestamp: true, pretty: false
|
28
|
+
@document = Oga.parse_xml file
|
15
29
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
30
|
+
@timestamp = timestamp
|
31
|
+
@pretty = pretty
|
32
|
+
|
33
|
+
@remote = !url.nil?
|
34
|
+
@aspects_path = path
|
35
|
+
@aspects_url = url
|
36
|
+
|
37
|
+
@aspects = if @remote then REMOTE_ASPECTS_CACHE[language] else ASPECTS_CACHE[aspects_file] end
|
16
38
|
end
|
17
39
|
|
18
40
|
##
|
@@ -20,64 +42,45 @@ module Opener
|
|
20
42
|
# @return [String]
|
21
43
|
#
|
22
44
|
def process
|
23
|
-
@language = get_language
|
24
|
-
@aspects = load_aspects
|
25
|
-
@terms = get_terms
|
26
|
-
|
27
45
|
existing_aspects = extract_aspects
|
28
46
|
|
29
47
|
add_features_layer
|
30
48
|
add_properties_layer
|
31
49
|
|
32
|
-
|
50
|
+
existing_aspects.each_with_index do |(key, value), index|
|
51
|
+
index += 1
|
33
52
|
|
34
|
-
existing_aspects.each_pair do |key,value|
|
35
53
|
add_property(key, value, index)
|
36
|
-
index += 1
|
37
54
|
end
|
38
55
|
|
39
56
|
add_linguistic_processor
|
40
57
|
|
41
|
-
return pretty_print(document)
|
42
|
-
end
|
43
|
-
|
44
|
-
##
|
45
|
-
# Loads the aspects from the txt file
|
46
|
-
# @return [Hash]
|
47
|
-
#
|
48
|
-
def load_aspects
|
49
|
-
aspects_hash = {}
|
50
|
-
|
51
|
-
File.foreach(aspects_file) do |line|
|
52
|
-
lemma, pos, aspect = line.gsub("\n", "").split("\t")
|
53
|
-
|
54
|
-
aspects_hash[lemma.to_sym] = [] unless aspects_hash[lemma.to_sym]
|
55
|
-
aspects_hash[lemma.to_sym] << aspect
|
56
|
-
end
|
57
|
-
|
58
|
-
return aspects_hash
|
58
|
+
return pretty ? pretty_print(document) : document.to_xml
|
59
59
|
end
|
60
60
|
|
61
61
|
##
|
62
62
|
# Get the language of the input file.
|
63
|
+
#
|
63
64
|
# @return [String]
|
64
65
|
#
|
65
|
-
def
|
66
|
-
document.at_xpath('KAF').get('xml:lang')
|
66
|
+
def language
|
67
|
+
return @language ||= document.at_xpath('KAF').get('xml:lang')
|
67
68
|
end
|
68
69
|
|
69
70
|
##
|
70
71
|
# Get the terms from the input file
|
71
72
|
# @return [Hash]
|
72
73
|
#
|
73
|
-
def
|
74
|
-
|
74
|
+
def terms
|
75
|
+
unless @terms
|
76
|
+
@terms = {}
|
75
77
|
|
76
|
-
|
77
|
-
|
78
|
+
document.xpath('KAF/terms/term').each do |term|
|
79
|
+
@terms[term.get('tid').to_sym] = term.get('lemma')
|
80
|
+
end
|
78
81
|
end
|
79
82
|
|
80
|
-
return
|
83
|
+
return @terms
|
81
84
|
end
|
82
85
|
|
83
86
|
##
|
@@ -93,7 +96,7 @@ module Opener
|
|
93
96
|
# lemmas) belong to a property.
|
94
97
|
max_ngram = 2
|
95
98
|
|
96
|
-
uniq_aspects = {}
|
99
|
+
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
97
100
|
|
98
101
|
while current_token < terms.count
|
99
102
|
(0..max_ngram).each do |tam_ngram|
|
@@ -107,7 +110,6 @@ module Opener
|
|
107
110
|
properties.uniq.each do |property|
|
108
111
|
next if !property or property.strip.empty?
|
109
112
|
|
110
|
-
uniq_aspects[property.to_sym] = [] unless uniq_aspects[property.to_sym]
|
111
113
|
uniq_aspects[property.to_sym] << [ids,ngram]
|
112
114
|
end
|
113
115
|
end
|
@@ -223,9 +225,13 @@ module Opener
|
|
223
225
|
return !!document.at_xpath('KAF')
|
224
226
|
end
|
225
227
|
|
228
|
+
##
|
229
|
+
# @return [String]
|
230
|
+
#
|
226
231
|
def aspects_file
|
227
|
-
|
232
|
+
@aspects_file ||= File.expand_path "#{aspects_path}/#{language}.txt", __FILE__
|
228
233
|
end
|
229
|
-
|
230
|
-
|
231
|
-
end
|
234
|
+
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Opener
|
2
|
+
class PropertyTagger
|
3
|
+
##
|
4
|
+
# Thread-safe cache for storing the contents of remote aspects.
|
5
|
+
#
|
6
|
+
class RemoteAspectsCache
|
7
|
+
|
8
|
+
include MonitorMixin
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
super
|
12
|
+
|
13
|
+
@url = ENV['PROPERTY_TAGGER_LEXICONS_URL']
|
14
|
+
@cache = {}
|
15
|
+
end
|
16
|
+
|
17
|
+
def [] lang
|
18
|
+
synchronize do
|
19
|
+
@cache[lang] ||= load_aspects lang
|
20
|
+
end
|
21
|
+
end
|
22
|
+
alias_method :get, :[]
|
23
|
+
|
24
|
+
def load_aspects lang
|
25
|
+
mapping = Hash.new{ |hash, key| hash[key] = [] }
|
26
|
+
url = "#{@url}&language_code=#{lang}"
|
27
|
+
lexicons = JSON.parse HTTPClient.new.get(url).body
|
28
|
+
lexicons = lexicons['data'].map{ |l| Hashie::Mash.new l }
|
29
|
+
puts "#{lang}: loaded aspects from #{url}"
|
30
|
+
|
31
|
+
lexicons.each do |l|
|
32
|
+
mapping[l.lemma.to_sym] << l.aspect
|
33
|
+
end
|
34
|
+
|
35
|
+
return mapping
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -28,9 +28,12 @@ Gem::Specification.new do |gem|
|
|
28
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
29
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
30
30
|
|
31
|
-
gem.add_dependency 'oga'
|
31
|
+
gem.add_dependency 'oga', ['~> 1.0', '>= 1.3.1']
|
32
|
+
gem.add_dependency 'httpclient'
|
33
|
+
gem.add_dependency 'hashie'
|
32
34
|
|
33
35
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
34
36
|
gem.add_development_dependency 'cucumber'
|
35
37
|
gem.add_development_dependency 'rake'
|
38
|
+
gem.add_development_dependency 'benchmark-ips', '~> 2.0'
|
36
39
|
end
|
data/task/test.rake
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -54,6 +54,40 @@ dependencies:
|
|
54
54
|
version: '2.2'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: oga
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.0'
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 1.3.1
|
65
|
+
type: :runtime
|
66
|
+
prerelease: false
|
67
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - "~>"
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '1.0'
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 1.3.1
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: httpclient
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
type: :runtime
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: hashie
|
57
91
|
requirement: !ruby/object:Gem::Requirement
|
58
92
|
requirements:
|
59
93
|
- - ">="
|
@@ -108,8 +142,22 @@ dependencies:
|
|
108
142
|
- - ">="
|
109
143
|
- !ruby/object:Gem::Version
|
110
144
|
version: '0'
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
name: benchmark-ips
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '2.0'
|
152
|
+
type: :development
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '2.0'
|
111
159
|
description: Property tagger for hotels in Dutch and English.
|
112
|
-
email:
|
160
|
+
email:
|
113
161
|
executables:
|
114
162
|
- property-tagger
|
115
163
|
- property-tagger-daemon
|
@@ -125,9 +173,11 @@ files:
|
|
125
173
|
- config.ru
|
126
174
|
- exec/property-tagger.rb
|
127
175
|
- lib/opener/property_tagger.rb
|
176
|
+
- lib/opener/property_tagger/aspects_cache.rb
|
128
177
|
- lib/opener/property_tagger/cli.rb
|
129
178
|
- lib/opener/property_tagger/processor.rb
|
130
179
|
- lib/opener/property_tagger/public/markdown.css
|
180
|
+
- lib/opener/property_tagger/remote_aspects_cache.rb
|
131
181
|
- lib/opener/property_tagger/server.rb
|
132
182
|
- lib/opener/property_tagger/version.rb
|
133
183
|
- lib/opener/property_tagger/views/index.erb
|
@@ -139,7 +189,7 @@ homepage: http://opener-project.github.com/
|
|
139
189
|
licenses:
|
140
190
|
- Apache 2.0
|
141
191
|
metadata: {}
|
142
|
-
post_install_message:
|
192
|
+
post_install_message:
|
143
193
|
rdoc_options: []
|
144
194
|
require_paths:
|
145
195
|
- lib
|
@@ -154,10 +204,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
204
|
- !ruby/object:Gem::Version
|
155
205
|
version: '0'
|
156
206
|
requirements: []
|
157
|
-
rubyforge_project:
|
158
|
-
rubygems_version: 2.
|
159
|
-
signing_key:
|
207
|
+
rubyforge_project:
|
208
|
+
rubygems_version: 2.7.8
|
209
|
+
signing_key:
|
160
210
|
specification_version: 4
|
161
211
|
summary: Property tagger for hotels in Dutch and English.
|
162
212
|
test_files: []
|
163
|
-
has_rdoc:
|