opener-property-tagger 3.0.2 → 3.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -7
- data/lib/opener/property_tagger/processor.rb +227 -0
- data/lib/opener/property_tagger/version.rb +1 -1
- data/lib/opener/property_tagger.rb +11 -41
- data/opener-property-tagger.gemspec +2 -8
- data/task/test.rake +1 -1
- metadata +60 -82
- data/core/extract_aspects.py +0 -18
- data/core/hotel_property_tagger_nl_en.py +0 -138
- data/ext/hack/Rakefile +0 -8
- data/pre_install_requirements.txt +0 -1
- data/task/compile.rake +0 -2
- data/task/python.rake +0 -11
- data/task/requirements.rake +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bd138d2aeb528bf87f83fde2af933ca3ebce6dd
|
4
|
+
data.tar.gz: e0e64e5f709effb0e2671b08790ee5c5e4afb5b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7228f65dde150175167d2ac0f72cac888f8649467be2bd983ba28d1d9738222f20aa8bec7678813f4200756529558159764b4edfa420c2d43d2aee9593f9dccc
|
7
|
+
data.tar.gz: 0303f61b4f14aedc1c37c4378f3a030d602280b261f60013426ec3d9d986356dabfebbbf6f7a665f04fd9f7b65ac0901a67567416eabea5725db96699b1ee67c
|
data/README.md
CHANGED
@@ -118,8 +118,6 @@ At least you need the following system setup:
|
|
118
118
|
### Dependencies for normal use:
|
119
119
|
|
120
120
|
* Ruby 1.9.3 or newer
|
121
|
-
* Python 2.6
|
122
|
-
* lxml installed
|
123
121
|
* libarchive (for running the tests and such), on Debian/Ubuntu based systems
|
124
122
|
this can be installed using `sudo apt-get install libarchive-dev`
|
125
123
|
|
@@ -137,11 +135,6 @@ is the word or span of words (in this case use whitespaces), then the part of
|
|
137
135
|
speech (which is not actually used, so you can include a dummy label) and
|
138
136
|
finally the aspect class associated with the word.
|
139
137
|
|
140
|
-
## The Core
|
141
|
-
|
142
|
-
The component is a fat wrapper around the actual language technology core. You
|
143
|
-
can find the core technolies (python) in the `/core` directory.
|
144
|
-
|
145
138
|
## Where to go from here
|
146
139
|
|
147
140
|
* [Check the project website](http://opener-project.github.io)
|
@@ -0,0 +1,227 @@
|
|
1
|
+
module Opener
  class PropertyTagger
    ##
    # Class that applies property tagging to a given input KAF file.
    #
    # The input is parsed with Oga. Terms whose lemmas (alone, or as a short
    # run of consecutive lemmas) occur in the aspects lexicon of the document
    # language are written to a fresh `features/properties` layer.
    #
    class Processor
      attr_accessor :document, :aspects_path, :language, :aspects, :terms,
        :timestamp

      ##
      # @param file KAF input, handed straight to `Oga.parse_xml`.
      # @param aspects_path [String] Directory containing the
      #  `<language>.txt` aspect lexicons.
      # @param timestamp [Boolean] When false, the linguistic processor entry
      #  gets the placeholder timestamp `*` instead of the current time.
      #
      # @raise [RuntimeError] When the parsed document has no `KAF` element.
      #
      def initialize(file, aspects_path, timestamp = true)
        @document     = Oga.parse_xml(file)
        @aspects_path = aspects_path
        @timestamp    = timestamp

        raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # The language, lexicon and terms are (re)loaded on every call, then the
      # features layer is rebuilt and one `property` node is added per aspect
      # found, numbered from `p1` upwards.
      #
      # @return [String]
      #
      def process
        @language = get_language
        @aspects  = load_aspects
        @terms    = get_terms

        existing_aspects = extract_aspects

        add_features_layer
        add_properties_layer

        existing_aspects.each_with_index do |(key, value), position|
          # Property ids are 1-based.
          add_property(key, value, position + 1)
        end

        add_linguistic_processor

        pretty_print(document)
      end

      ##
      # Loads the aspects from the txt file.
      #
      # Every line is expected to hold three tab separated fields: lemma, part
      # of speech (ignored) and aspect. Blank lines and lines lacking a lemma
      # or an aspect are skipped instead of raising, and both "\n" and "\r\n"
      # line endings are accepted.
      #
      # @return [Hash] Lemma Symbols mapped to an Array of aspect Strings.
      #
      def load_aspects
        aspects_hash = {}

        File.foreach(aspects_file) do |line|
          # chomp (rather than removing "\n" only) also drops a trailing "\r".
          lemma, _pos, aspect = line.chomp.split("\t")

          next if lemma.nil? || lemma.empty? || aspect.nil?

          (aspects_hash[lemma.to_sym] ||= []) << aspect
        end

        aspects_hash
      end

      ##
      # Get the language of the input file.
      #
      # @return [String] Value of the `xml:lang` attribute of the KAF element.
      #
      def get_language
        document.at_xpath('KAF').get('xml:lang')
      end

      ##
      # Get the terms from the input file.
      #
      # @return [Hash] Term ids (as Symbols) mapped to the term's lemma, in
      #  document order.
      #
      def get_terms
        terms_hash = {}

        document.xpath('KAF/terms/term').each do |term|
          terms_hash[term.get('tid').to_sym] = term.get('lemma')
        end

        terms_hash
      end

      ##
      # Check which terms belong to an aspect (property).
      #
      # For every term position the lemma itself and the sequences formed with
      # up to `max_ngram` following lemmas are looked up in the lexicon. With
      # `max_ngram = 2` that covers unigrams, bigrams and trigrams.
      #
      # @return [Hash] Property Symbols (sorted) mapped to an Array of
      #  `[term_ids, ngram]` pairs.
      #
      def extract_aspects
        term_ids = terms.keys
        lemmas   = terms.values

        # Maximum number of lemmas appended to the starting lemma.
        max_ngram = 2

        uniq_aspects = {}

        lemmas.each_index do |first|
          (0..max_ngram).each do |extra|
            last = first + extra

            # The slice below is inclusive, so stop as soon as it would run
            # past the final term; a truncated slice would only repeat a
            # shorter n-gram that has already been recorded.
            break if last >= lemmas.length

            ngram      = lemmas[first..last].join(" ").downcase
            properties = aspects[ngram.to_sym]

            next unless properties

            ids = term_ids[first..last]

            properties.uniq.each do |property|
              # Ignore missing or whitespace-only aspect names.
              next if property.nil? || property.gsub(" ", "").empty?

              (uniq_aspects[property.to_sym] ||= []) << [ids, ngram]
            end
          end
        end

        Hash[uniq_aspects.sort]
      end

      ##
      # Remove the features layer from the KAF file if it exists and add a new
      # one.
      #
      def add_features_layer
        existing = document.at_xpath('KAF/features')

        existing.remove if existing

        new_node('features', 'KAF')
      end

      ##
      # Add the properties layer as a child to the features layer.
      #
      def add_properties_layer
        new_node("properties", "KAF/features")
      end

      ##
      # Adds a single `property` node, including its references, to the
      # properties layer.
      #
      # @param key [Symbol] Name of the property, stored in `lemma`.
      # @param value [Array] `[term_ids, ngram]` pairs as returned by
      #  {#extract_aspects}; duplicates are written only once.
      # @param index [Integer] Number used to build the `pid` (`p<index>`).
      #
      def add_property(key, value, index)
        property_node = new_node("property", "KAF/features/properties")

        property_node.set('lemma', key.to_s)
        property_node.set('pid', "p#{index.to_s}")

        references_node = new_node("references", property_node)

        value.uniq.each do |v|
          # The matched n-gram is kept as a comment preceding its span.
          comment = Oga::XML::Comment.new(:text => v.last)

          references_node.children << comment

          span_node = new_node("span", references_node)

          v.first.each do |val|
            target_node = new_node("target", span_node)

            target_node.set('id', val.to_s)
          end
        end
      end

      ##
      # Appends a `linguisticProcessors` entry for the features layer to the
      # KAF header, describing this tagger and when it ran.
      #
      def add_linguistic_processor
        description = 'VUA property tagger'
        last_edited = '16jan2015'
        version     = '2.0'

        node = new_node('linguisticProcessors', 'KAF/kafHeader')
        node.set('layer', 'features')

        lp_node = new_node('lp', node)

        lp_node.set('version', "#{last_edited}-#{version}")
        lp_node.set('name', description)

        if timestamp
          format = '%Y-%m-%dT%H:%M:%S%Z'

          lp_node.set('timestamp', Time.now.strftime(format))
        else
          # Fixed placeholder used when timestamps are disabled.
          lp_node.set('timestamp', '*')
        end
      end

      ##
      # Format the output document properly.
      #
      # TODO: this should be handled by Oga in a nice way.
      #
      # @param document Object responding to `to_xml`.
      # @return [String]
      #
      def pretty_print(document)
        doc = REXML::Document.new document.to_xml
        doc.context[:attribute_quote] = :quote
        out = ""
        formatter = REXML::Formatters::Pretty.new
        formatter.compact = true
        formatter.write(doc, out)

        out.strip
      end

      protected

      ##
      # Creates an element and appends it to the given parent.
      #
      # @param tag [String] Name of the new element.
      # @param parent [String, Object] Either an XPath expression resolved
      #  against the document, or the parent node itself.
      # @return [Oga::XML::Element] The newly added node.
      #
      def new_node(tag, parent)
        if parent.is_a?(String)
          parent_node = document.at_xpath(parent)
        else
          parent_node = parent
        end

        node = Oga::XML::Element.new(:name => tag)

        parent_node.children << node

        node
      end

      ##
      # Check if input is a KAF file.
      #
      # @return [Boolean]
      #
      def is_kaf?
        !!document.at_xpath('KAF')
      end

      ##
      # Path of the lexicon for the current language.
      #
      # NOTE(review): a relative `aspects_path` is expanded against this
      # source file's own path rather than its directory, so callers should
      # pass an absolute path. Confirm whether that base is intended.
      #
      # @return [String]
      #
      def aspects_file
        File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
      end
    end # Processor
  end # PropertyTagger
end # Opener
|
@@ -1,8 +1,13 @@
|
|
1
1
|
require 'open3'
|
2
2
|
require 'slop'
|
3
|
+
require 'oga'
|
4
|
+
|
5
|
+
require 'rexml/document'
|
6
|
+
require 'rexml/formatters/pretty'
|
3
7
|
|
4
8
|
require_relative 'property_tagger/version'
|
5
9
|
require_relative 'property_tagger/cli'
|
10
|
+
require_relative 'property_tagger/processor'
|
6
11
|
|
7
12
|
module Opener
|
8
13
|
##
|
@@ -28,15 +33,6 @@ module Opener
|
|
28
33
|
@options = options
|
29
34
|
end
|
30
35
|
|
31
|
-
##
|
32
|
-
# Returns a String containing the command to use for executing the kernel.
|
33
|
-
#
|
34
|
-
# @return [String]
|
35
|
-
#
|
36
|
-
def command
|
37
|
-
return "python -E #{kernel} #{args.join(' ')} --path #{path}"
|
38
|
-
end
|
39
|
-
|
40
36
|
##
|
41
37
|
# Get the resource path for the lexicon files, defaults to an ENV variable
|
42
38
|
#
|
@@ -50,7 +46,7 @@ module Opener
|
|
50
46
|
raise ArgumentError, 'No lexicon path provided'
|
51
47
|
end
|
52
48
|
|
53
|
-
return path
|
49
|
+
return File.expand_path(path)
|
54
50
|
end
|
55
51
|
|
56
52
|
##
|
@@ -61,41 +57,15 @@ module Opener
|
|
61
57
|
# @return [Array]
|
62
58
|
#
|
63
59
|
def run(input)
|
64
|
-
|
65
|
-
|
66
|
-
raise stderr unless process.success?
|
60
|
+
output = process(input)
|
67
61
|
|
68
|
-
return
|
62
|
+
return output
|
69
63
|
end
|
70
64
|
|
71
65
|
protected
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# this is a workaround
|
76
|
-
#
|
77
|
-
def capture(input)
|
78
|
-
Open3.popen3(*command.split(" ")) {|i, o, e, t|
|
79
|
-
out_reader = Thread.new { o.read }
|
80
|
-
err_reader = Thread.new { e.read }
|
81
|
-
i.write input
|
82
|
-
i.close
|
83
|
-
[out_reader.value, err_reader.value, t.value]
|
84
|
-
}
|
85
|
-
end
|
86
|
-
|
87
|
-
##
|
88
|
-
# @return [String]
|
89
|
-
#
|
90
|
-
def core_dir
|
91
|
-
return File.expand_path('../../../core', __FILE__)
|
92
|
-
end
|
93
|
-
|
94
|
-
##
|
95
|
-
# @return [String]
|
96
|
-
#
|
97
|
-
def kernel
|
98
|
-
return File.join(core_dir, 'hotel_property_tagger_nl_en.py')
|
66
|
+
def process(input)
|
67
|
+
processor = Opener::PropertyTagger::Processor.new(input, path, !args.include?("--no-time"))
|
68
|
+
return processor.process
|
99
69
|
end
|
100
70
|
end # PolarityTagger
|
101
71
|
end # Opener
|
@@ -7,20 +7,15 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.summary = 'Property tagger for hotels in Dutch and English.'
|
8
8
|
gem.description = gem.summary
|
9
9
|
gem.homepage = 'http://opener-project.github.com/'
|
10
|
-
gem.extensions = ['ext/hack/Rakefile']
|
11
10
|
|
12
11
|
gem.license = 'Apache 2.0'
|
13
12
|
|
14
13
|
gem.required_ruby_version = '>= 1.9.2'
|
15
14
|
|
16
15
|
gem.files = Dir.glob([
|
17
|
-
'core/data/**/*',
|
18
|
-
'core/*.py',
|
19
|
-
'ext/**/*',
|
20
16
|
'lib/**/*',
|
21
17
|
'config.ru',
|
22
18
|
'*.gemspec',
|
23
|
-
'*_requirements.txt',
|
24
19
|
'README.md',
|
25
20
|
'LICENSE.txt',
|
26
21
|
'exec/**/*',
|
@@ -33,10 +28,9 @@ Gem::Specification.new do |gem|
|
|
33
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
34
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
35
30
|
|
36
|
-
gem.add_dependency '
|
37
|
-
gem.add_dependency 'rake'
|
38
|
-
gem.add_dependency 'cliver'
|
31
|
+
gem.add_dependency 'oga'
|
39
32
|
|
40
33
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
41
34
|
gem.add_development_dependency 'cucumber'
|
35
|
+
gem.add_development_dependency 'rake'
|
42
36
|
end
|
data/task/test.rake
CHANGED
metadata
CHANGED
@@ -1,184 +1,162 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '2.2'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
15
|
version_requirements: !ruby/object:Gem::Requirement
|
23
16
|
requirements:
|
24
|
-
- -
|
17
|
+
- - ~>
|
25
18
|
- !ruby/object:Gem::Version
|
26
19
|
version: '2.2'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: opener-webservice
|
29
20
|
requirement: !ruby/object:Gem::Requirement
|
30
21
|
requirements:
|
31
|
-
- -
|
22
|
+
- - ~>
|
32
23
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.
|
34
|
-
type: :runtime
|
24
|
+
version: '2.2'
|
35
25
|
prerelease: false
|
26
|
+
type: :runtime
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: opener-webservice
|
36
29
|
version_requirements: !ruby/object:Gem::Requirement
|
37
30
|
requirements:
|
38
|
-
- -
|
31
|
+
- - ~>
|
39
32
|
- !ruby/object:Gem::Version
|
40
33
|
version: '2.1'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: opener-core
|
43
34
|
requirement: !ruby/object:Gem::Requirement
|
44
35
|
requirements:
|
45
|
-
- -
|
36
|
+
- - ~>
|
46
37
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2.
|
48
|
-
type: :runtime
|
38
|
+
version: '2.1'
|
49
39
|
prerelease: false
|
40
|
+
type: :runtime
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: opener-core
|
50
43
|
version_requirements: !ruby/object:Gem::Requirement
|
51
44
|
requirements:
|
52
|
-
- -
|
45
|
+
- - ~>
|
53
46
|
- !ruby/object:Gem::Version
|
54
47
|
version: '2.2'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: nokogiri
|
57
48
|
requirement: !ruby/object:Gem::Requirement
|
58
49
|
requirements:
|
59
|
-
- -
|
50
|
+
- - ~>
|
60
51
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
62
|
-
type: :runtime
|
52
|
+
version: '2.2'
|
63
53
|
prerelease: false
|
54
|
+
type: :runtime
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: oga
|
64
57
|
version_requirements: !ruby/object:Gem::Requirement
|
65
58
|
requirements:
|
66
|
-
- -
|
59
|
+
- - '>='
|
67
60
|
- !ruby/object:Gem::Version
|
68
61
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: rake
|
71
62
|
requirement: !ruby/object:Gem::Requirement
|
72
63
|
requirements:
|
73
|
-
- -
|
64
|
+
- - '>='
|
74
65
|
- !ruby/object:Gem::Version
|
75
66
|
version: '0'
|
76
|
-
type: :runtime
|
77
67
|
prerelease: false
|
68
|
+
type: :runtime
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
78
71
|
version_requirements: !ruby/object:Gem::Requirement
|
79
72
|
requirements:
|
80
|
-
- -
|
73
|
+
- - ~>
|
81
74
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: cliver
|
75
|
+
version: '3.0'
|
85
76
|
requirement: !ruby/object:Gem::Requirement
|
86
77
|
requirements:
|
87
|
-
- -
|
78
|
+
- - ~>
|
88
79
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
80
|
+
version: '3.0'
|
91
81
|
prerelease: false
|
82
|
+
type: :development
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: cucumber
|
92
85
|
version_requirements: !ruby/object:Gem::Requirement
|
93
86
|
requirements:
|
94
|
-
- -
|
87
|
+
- - '>='
|
95
88
|
- !ruby/object:Gem::Version
|
96
89
|
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: rspec
|
99
90
|
requirement: !ruby/object:Gem::Requirement
|
100
91
|
requirements:
|
101
|
-
- -
|
92
|
+
- - '>='
|
102
93
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
104
|
-
type: :development
|
94
|
+
version: '0'
|
105
95
|
prerelease: false
|
96
|
+
type: :development
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rake
|
106
99
|
version_requirements: !ruby/object:Gem::Requirement
|
107
100
|
requirements:
|
108
|
-
- -
|
101
|
+
- - '>='
|
109
102
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: cucumber
|
103
|
+
version: '0'
|
113
104
|
requirement: !ruby/object:Gem::Requirement
|
114
105
|
requirements:
|
115
|
-
- -
|
106
|
+
- - '>='
|
116
107
|
- !ruby/object:Gem::Version
|
117
108
|
version: '0'
|
118
|
-
type: :development
|
119
109
|
prerelease: false
|
120
|
-
|
121
|
-
requirements:
|
122
|
-
- - ">="
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
110
|
+
type: :development
|
125
111
|
description: Property tagger for hotels in Dutch and English.
|
126
|
-
email:
|
112
|
+
email:
|
127
113
|
executables:
|
128
114
|
- property-tagger
|
129
115
|
- property-tagger-daemon
|
130
116
|
- property-tagger-server
|
131
|
-
extensions:
|
132
|
-
- ext/hack/Rakefile
|
117
|
+
extensions: []
|
133
118
|
extra_rdoc_files: []
|
134
119
|
files:
|
135
|
-
- LICENSE.txt
|
136
|
-
- README.md
|
137
|
-
- bin/property-tagger
|
138
|
-
- bin/property-tagger-daemon
|
139
|
-
- bin/property-tagger-server
|
140
|
-
- config.ru
|
141
|
-
- core/extract_aspects.py
|
142
|
-
- core/hotel_property_tagger_nl_en.py
|
143
|
-
- exec/property-tagger.rb
|
144
|
-
- ext/hack/Rakefile
|
145
120
|
- lib/opener/property_tagger.rb
|
146
121
|
- lib/opener/property_tagger/cli.rb
|
147
|
-
- lib/opener/property_tagger/
|
122
|
+
- lib/opener/property_tagger/processor.rb
|
148
123
|
- lib/opener/property_tagger/server.rb
|
149
124
|
- lib/opener/property_tagger/version.rb
|
125
|
+
- lib/opener/property_tagger/public/markdown.css
|
150
126
|
- lib/opener/property_tagger/views/index.erb
|
151
127
|
- lib/opener/property_tagger/views/result.erb
|
128
|
+
- config.ru
|
152
129
|
- opener-property-tagger.gemspec
|
153
|
-
-
|
154
|
-
-
|
130
|
+
- README.md
|
131
|
+
- LICENSE.txt
|
132
|
+
- exec/property-tagger.rb
|
155
133
|
- task/lexicons.rake
|
156
|
-
- task/python.rake
|
157
|
-
- task/requirements.rake
|
158
134
|
- task/test.rake
|
135
|
+
- bin/property-tagger
|
136
|
+
- bin/property-tagger-daemon
|
137
|
+
- bin/property-tagger-server
|
159
138
|
homepage: http://opener-project.github.com/
|
160
139
|
licenses:
|
161
140
|
- Apache 2.0
|
162
141
|
metadata: {}
|
163
|
-
post_install_message:
|
142
|
+
post_install_message:
|
164
143
|
rdoc_options: []
|
165
144
|
require_paths:
|
166
145
|
- lib
|
167
146
|
required_ruby_version: !ruby/object:Gem::Requirement
|
168
147
|
requirements:
|
169
|
-
- -
|
148
|
+
- - '>='
|
170
149
|
- !ruby/object:Gem::Version
|
171
150
|
version: 1.9.2
|
172
151
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
173
152
|
requirements:
|
174
|
-
- -
|
153
|
+
- - '>='
|
175
154
|
- !ruby/object:Gem::Version
|
176
155
|
version: '0'
|
177
156
|
requirements: []
|
178
|
-
rubyforge_project:
|
179
|
-
rubygems_version: 2.
|
180
|
-
signing_key:
|
157
|
+
rubyforge_project:
|
158
|
+
rubygems_version: 2.1.9
|
159
|
+
signing_key:
|
181
160
|
specification_version: 4
|
182
161
|
summary: Property tagger for hotels in Dutch and English.
|
183
162
|
test_files: []
|
184
|
-
has_rdoc:
|
data/core/extract_aspects.py
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
from lxml import etree
|
4
|
-
import sys
|
5
|
-
#filename='/Users/ruben/CODE/VU-sentiment-lexicon-xml/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml'
|
6
|
-
|
7
|
-
root = etree.parse(sys.stdin).getroot()
|
8
|
-
|
9
|
-
for element in root.findall('Lexicon/LexicalEntry'):
|
10
|
-
ele_lemma = element.findall('Lemma')[0]
|
11
|
-
ele_domain = element.findall('Sense/Domain')[0]
|
12
|
-
pos = element.get('partOfSpeech','unknown_pos')
|
13
|
-
if ele_lemma is not None and ele_domain is not None:
|
14
|
-
lemma = ele_lemma.get('writtenForm','').lower()
|
15
|
-
aspect = ele_domain.get('aspect','').lower()
|
16
|
-
if lemma!='' and aspect!='':
|
17
|
-
print lemma.encode('utf-8')+'\t'+pos.encode('utf-8')+'\t'+aspect.encode('utf-8')
|
18
|
-
|
@@ -1,138 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
import sys
|
4
|
-
import argparse
|
5
|
-
|
6
|
-
import codecs
|
7
|
-
import os
|
8
|
-
|
9
|
-
this_folder = os.path.dirname(os.path.realpath(__file__))
|
10
|
-
|
11
|
-
# This updates the load path to ensure that the local site-packages directory
|
12
|
-
# can be used to load packages (e.g. a locally installed copy of lxml).
|
13
|
-
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
|
14
|
-
|
15
|
-
from VUKafParserPy import KafParser
|
16
|
-
from lxml import etree
|
17
|
-
from collections import defaultdict
|
18
|
-
|
19
|
-
__desc='VUA property tagger'
|
20
|
-
__last_edited='20may2014'
|
21
|
-
__version='1.0'
|
22
|
-
|
23
|
-
###
|
24
|
-
__module_dir = os.path.dirname(__file__)
|
25
|
-
max_ngram = 1
|
26
|
-
verbose = False
|
27
|
-
##
|
28
|
-
|
29
|
-
|
30
|
-
########################################
|
31
|
-
## Format of the file:
|
32
|
-
#lemma pos aspect
|
33
|
-
#lemma pos aspect
|
34
|
-
########################################
|
35
|
-
def loadAspects(my_lang,this_file=None):
|
36
|
-
my_aspects = {}
|
37
|
-
if this_file is not None:
|
38
|
-
aspects_filename = this_file
|
39
|
-
else:
|
40
|
-
filename = "{0}.txt".format(my_lang)
|
41
|
-
print>>sys.stderr, "filename thingy",filename
|
42
|
-
print>>sys.stderr, "path thingy",arguments.path
|
43
|
-
aspects_filename = os.path.join(arguments.path,filename)
|
44
|
-
|
45
|
-
if not os.path.exists(aspects_filename):
|
46
|
-
print>>sys.stderr,'ERROR: file with aspects for the language',my_lang,'not found in',aspects_filename
|
47
|
-
else:
|
48
|
-
fic = codecs.open(aspects_filename,'r','utf-8')
|
49
|
-
for line in fic:
|
50
|
-
fields = line.strip().split('\t')
|
51
|
-
if len(fields) == 3:
|
52
|
-
lemma,pos,aspect = fields
|
53
|
-
my_aspects[lemma] = aspect
|
54
|
-
fic.close()
|
55
|
-
return aspects_filename, my_aspects
|
56
|
-
########################################
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
###### MAIN ########
|
61
|
-
|
62
|
-
argument_parser = argparse.ArgumentParser(description='Tags a text with polarities at lemma level')
|
63
|
-
argument_parser.add_argument("--no-time",action="store_false", default=True, dest="my_time_stamp",help="For not including timestamp in header")
|
64
|
-
argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon", help="Force to use this lexicon")
|
65
|
-
argument_parser.add_argument("--path", action="store", default=None, dest="path", help="Set the path where the property aspects are found.")
|
66
|
-
|
67
|
-
arguments = argument_parser.parse_args()
|
68
|
-
|
69
|
-
if not sys.stdin.isatty():
|
70
|
-
## READING FROM A PIPE
|
71
|
-
pass
|
72
|
-
else:
|
73
|
-
print>>sys.stderr,'Input stream required.'
|
74
|
-
print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
|
75
|
-
print>>sys.stderr,sys.argv[0]+' -h for help'
|
76
|
-
sys.exit(-1)
|
77
|
-
|
78
|
-
|
79
|
-
## Load the tree and the list of terms with the id
|
80
|
-
my_data = []
|
81
|
-
try:
|
82
|
-
my_kaf_tree = KafParser(sys.stdin)
|
83
|
-
except Exception as e:
|
84
|
-
print>>sys.stdout,'Error parsing input. Input is required to be KAF'
|
85
|
-
print>>sys.stdout,str(e)
|
86
|
-
sys.exit(2)
|
87
|
-
|
88
|
-
|
89
|
-
## Get language from the KAF file
|
90
|
-
my_lang = my_kaf_tree.getLanguage()
|
91
|
-
|
92
|
-
my_aspects_filename = my_aspects = None
|
93
|
-
if arguments.lexicon is None:
|
94
|
-
if my_lang not in ['nl','en','de','fr','it','es']:
|
95
|
-
print>>sys.stdout,'Error in the language specified in your KAF. The language is ',my_lang,' and possible values for this module '
|
96
|
-
print>>sys.stdout,'are nl for Dutch ,en for English, es Spanish, fr French, it Italian or de German'
|
97
|
-
sys.exit(1)
|
98
|
-
|
99
|
-
my_aspects_filename, my_aspects = loadAspects(my_lang)
|
100
|
-
else:
|
101
|
-
my_aspects_filename, my_aspects = loadAspects(my_lang,this_file=arguments.lexicon)
|
102
|
-
|
103
|
-
if verbose:
|
104
|
-
print>>sys.stderr,'Loaded ',len(my_aspects),'aspects from',my_aspects_filename
|
105
|
-
|
106
|
-
|
107
|
-
for term in my_kaf_tree.getTerms():
|
108
|
-
my_data.append((term.getLemma(),term.getId()))
|
109
|
-
if verbose: print>>sys.stderr,'Number of terms in the kaf file:',len(my_data)
|
110
|
-
|
111
|
-
|
112
|
-
current_token = found = 0
|
113
|
-
uniq_aspects = defaultdict(list)
|
114
|
-
while current_token < len(my_data):
|
115
|
-
for tam_ngram in range(1,max_ngram+1):
|
116
|
-
# Build an n-gram of size tam_ngram and beginning in current_token
|
117
|
-
if current_token + tam_ngram <= len(my_data):
|
118
|
-
ngram = ' '.join(lemma for lemma,_ in my_data[current_token:current_token+tam_ngram])
|
119
|
-
aspect = my_aspects.get(ngram.lower(),None)
|
120
|
-
if aspect is not None:
|
121
|
-
list_of_ids = [id for _,id in my_data[current_token:current_token+tam_ngram]]
|
122
|
-
uniq_aspects[aspect].append((list_of_ids,ngram))
|
123
|
-
current_token += 1
|
124
|
-
|
125
|
-
|
126
|
-
## Code for generating the propery layer included in the Parser
|
127
|
-
for aspect, list_of_lists in uniq_aspects.items():
|
128
|
-
for list_of_ids, str_text in list_of_lists:
|
129
|
-
my_kaf_tree.add_property(aspect,list_of_ids,str_text)
|
130
|
-
|
131
|
-
my_kaf_tree.addLinguisticProcessor(__desc,__last_edited+'_'+__version,'features', arguments.my_time_stamp)
|
132
|
-
my_kaf_tree.saveToFile(sys.stdout)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
data/ext/hack/Rakefile
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
import File.expand_path('../../../task/requirements.rake', __FILE__)
|
2
|
-
import File.expand_path('../../../task/python.rake', __FILE__)
|
3
|
-
|
4
|
-
task :default => :requirements do
|
5
|
-
Dir.chdir(File.expand_path('../../../', __FILE__)) do
|
6
|
-
Rake::Task['core/site-packages/pre_install'].invoke
|
7
|
-
end
|
8
|
-
end
|
@@ -1 +0,0 @@
|
|
1
|
-
https://github.com/opener-project/VU-kaf-parser/archive/master.zip#egg=VUKafParserPy
|
data/task/compile.rake
DELETED
data/task/python.rake
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
# NOTE: pre_build/pre_install directories are created by pip.
|
2
|
-
|
3
|
-
directory 'core/site-packages/pre_install' do |task|
|
4
|
-
sh "pip install --requirement=pre_install_requirements.txt " \
|
5
|
-
"--target=#{task.name} --ignore-installed"
|
6
|
-
end
|
7
|
-
|
8
|
-
namespace :python do
|
9
|
-
desc 'Installs Python packages in a local directory'
|
10
|
-
task :compile => ['core/site-packages/pre_install']
|
11
|
-
end
|