opener-property-tagger 3.0.2 → 3.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +0 -7
- data/lib/opener/property_tagger/processor.rb +227 -0
- data/lib/opener/property_tagger/version.rb +1 -1
- data/lib/opener/property_tagger.rb +11 -41
- data/opener-property-tagger.gemspec +2 -8
- data/task/test.rake +1 -1
- metadata +60 -82
- data/core/extract_aspects.py +0 -18
- data/core/hotel_property_tagger_nl_en.py +0 -138
- data/ext/hack/Rakefile +0 -8
- data/pre_install_requirements.txt +0 -1
- data/task/compile.rake +0 -2
- data/task/python.rake +0 -11
- data/task/requirements.rake +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bd138d2aeb528bf87f83fde2af933ca3ebce6dd
|
4
|
+
data.tar.gz: e0e64e5f709effb0e2671b08790ee5c5e4afb5b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7228f65dde150175167d2ac0f72cac888f8649467be2bd983ba28d1d9738222f20aa8bec7678813f4200756529558159764b4edfa420c2d43d2aee9593f9dccc
|
7
|
+
data.tar.gz: 0303f61b4f14aedc1c37c4378f3a030d602280b261f60013426ec3d9d986356dabfebbbf6f7a665f04fd9f7b65ac0901a67567416eabea5725db96699b1ee67c
|
data/README.md
CHANGED
@@ -118,8 +118,6 @@ At least you need the following system setup:
|
|
118
118
|
### Dependencies for normal use:
|
119
119
|
|
120
120
|
* Ruby 1.9.3 or newer
|
121
|
-
* Python 2.6
|
122
|
-
* lxml installed
|
123
121
|
* libarchive (for running the tests and such), on Debian/Ubuntu based systems
|
124
122
|
this can be installed using `sudo apt-get install libarchive-dev`
|
125
123
|
|
@@ -137,11 +135,6 @@ is the word or span of words (in this case use whitespaces), then the part of
|
|
137
135
|
speech (which is not actually used, so you can include a dummy label) and
|
138
136
|
finally the aspect class associated with the word.
|
139
137
|
|
140
|
-
## The Core
|
141
|
-
|
142
|
-
The component is a fat wrapper around the actual language technology core. You
|
143
|
-
can find the core technologies (Python) in the `/core` directory.
|
144
|
-
|
145
138
|
## Where to go from here
|
146
139
|
|
147
140
|
* [Check the project website](http://opener-project.github.io)
|
@@ -0,0 +1,227 @@
|
|
1
|
+
module Opener
  class PropertyTagger
    ##
    # Class that applies property tagging to a given input KAF file.
    #
    # The input document is parsed with Oga. Terms whose lemmas (or short
    # sequences of consecutive lemmas) occur in the aspects lexicon of the
    # document's language are written to a fresh `features/properties` layer,
    # and a `linguisticProcessors` entry is appended to the KAF header.
    #
    class Processor
      attr_accessor :document, :aspects_path, :language, :aspects, :terms,
        :timestamp

      ##
      # @param [String] file The KAF XML input, handed to `Oga.parse_xml`.
      # @param [String] aspects_path Directory that holds the
      #  `<language>.txt` aspect lexicon files.
      # @param [TrueClass|FalseClass] timestamp When true the current time is
      #  written to the linguistic processor entry, otherwise a literal `*`.
      #
      # @raise [RuntimeError] When the document has no `KAF` root element.
      #
      def initialize(file, aspects_path, timestamp = true)
        @document     = Oga.parse_xml(file)
        @aspects_path = aspects_path
        @timestamp    = timestamp

        raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # @return [String]
      #
      def process
        @language = get_language
        @aspects  = load_aspects
        @terms    = get_terms

        existing_aspects = extract_aspects

        add_features_layer
        add_properties_layer

        # Property identifiers are 1-based: p1, p2, ...
        existing_aspects.each_with_index do |(key, value), position|
          add_property(key, value, position + 1)
        end

        add_linguistic_processor

        return pretty_print(document)
      end

      ##
      # Loads the aspects from the txt file.
      #
      # Every line is expected to hold three tab separated columns: the
      # lemma (or whitespace separated span of lemmas), a part of speech
      # label (ignored) and the aspect. Lines that lack a lemma or an aspect
      # column (for example blank lines) are skipped.
      #
      # @return [Hash] Lemma Symbols mapped to an Array of aspect Strings.
      #
      def load_aspects
        aspects_hash = {}

        File.foreach(aspects_file) do |line|
          # chomp strips both "\n" and "\r\n" line endings.
          lemma, _pos, aspect = line.chomp.split("\t")

          # A nil lemma would crash on to_sym and a nil aspect would crash
          # later on in extract_aspects, so ignore incomplete lines.
          next if lemma.nil? || aspect.nil?

          (aspects_hash[lemma.to_sym] ||= []) << aspect
        end

        return aspects_hash
      end

      ##
      # Get the language of the input file.
      #
      # @return [String]
      #
      def get_language
        document.at_xpath('KAF').get('xml:lang')
      end

      ##
      # Get the terms from the input file.
      #
      # @return [Hash] Term ID Symbols mapped to their lemma, in document
      #  order.
      #
      def get_terms
        terms_hash = {}

        document.xpath('KAF/terms/term').each do |term|
          terms_hash[term.get('tid').to_sym] = term.get('lemma')
        end

        return terms_hash
      end

      ##
      # Check which terms belong to an aspect (property).
      #
      # Starting at every term a window of consecutive lemmas is joined with
      # spaces, downcased and looked up in the aspects lexicon. The window
      # grows by the offsets `0..max_ngram` (inclusive), so spans of 1 up to
      # `max_ngram + 1` lemmas are tried.
      #
      # @return [Hash] Aspect Symbols (sorted) mapped to an Array of
      #  `[term_ids, ngram]` pairs.
      #
      def extract_aspects
        term_ids  = terms.keys
        lemmas    = terms.values
        max_ngram = 2

        uniq_aspects = {}

        lemmas.each_index do |start|
          (0..max_ngram).each do |offset|
            stop = start + offset

            # Stop as soon as the window would run past the last term. A
            # truncated window would merely repeat a shorter n-gram that was
            # already handled.
            break if stop >= lemmas.length

            ngram      = lemmas[start..stop].join(' ').downcase
            properties = aspects[ngram.to_sym]

            next unless properties

            ids = term_ids[start..stop]

            properties.uniq.each do |property|
              # Aspects that only consist of spaces carry no information.
              next if property.gsub(' ', '').empty?

              (uniq_aspects[property.to_sym] ||= []) << [ids, ngram]
            end
          end
        end

        return Hash[uniq_aspects.sort]
      end

      ##
      # Remove the features layer from the KAF file if it exists and add a new
      # one.
      #
      def add_features_layer
        existing = document.at_xpath('KAF/features')

        existing.remove if existing

        new_node('features', 'KAF')
      end

      ##
      # Add the properties layer as a child to the features layer.
      #
      def add_properties_layer
        new_node("properties", "KAF/features")
      end

      ##
      # Adds a single property node, including the references to the terms
      # that triggered it, to the properties layer.
      #
      # @param [Symbol|String] key The aspect, stored in the `lemma` attribute.
      # @param [Array] value `[term_ids, ngram]` pairs as produced by
      #  {#extract_aspects}. Duplicate pairs are only written once.
      # @param [Fixnum] index Number used to build the `pid` attribute.
      #
      def add_property(key, value, index)
        property_node = new_node("property", "KAF/features/properties")

        property_node.set('lemma', key.to_s)
        property_node.set('pid', "p#{index}")

        references_node = new_node("references", property_node)

        value.uniq.each do |ids, ngram|
          # The matched text is included as an XML comment in front of the
          # span that lists the matching term IDs.
          references_node.children << Oga::XML::Comment.new(:text => ngram)

          span_node = new_node("span", references_node)

          ids.each do |id|
            new_node("target", span_node).set('id', id.to_s)
          end
        end
      end

      ##
      # Appends a `linguisticProcessors` node for the features layer to the
      # KAF header, describing this tagger.
      #
      def add_linguistic_processor
        description = 'VUA property tagger'
        last_edited = '16jan2015'
        version     = '2.0'

        node = new_node('linguisticProcessors', 'KAF/kafHeader')
        node.set('layer', 'features')

        lp_node = new_node('lp', node)

        lp_node.set('version', "#{last_edited}-#{version}")
        lp_node.set('name', description)

        if timestamp
          format = '%Y-%m-%dT%H:%M:%S%Z'

          lp_node.set('timestamp', Time.now.strftime(format))
        else
          # Placeholder used when timestamps are disabled.
          lp_node.set('timestamp', '*')
        end
      end

      ##
      # Format the output document properly.
      #
      # TODO: this should be handled by Oga in a nice way.
      #
      # @param [Oga::XML::Document] document
      # @return [String]
      #
      def pretty_print(document)
        doc = REXML::Document.new document.to_xml
        doc.context[:attribute_quote] = :quote
        out = ""
        formatter = REXML::Formatters::Pretty.new
        formatter.compact = true
        formatter.write(doc, out)

        return out.strip
      end

      protected

      ##
      # Creates a new element and appends it to the given parent.
      #
      # @param [String] tag Name of the element to create.
      # @param [String|Oga::XML::Element] parent Either an XPath expression
      #  (evaluated against the document) or the parent node itself.
      # @return [Oga::XML::Element] The newly added node.
      #
      def new_node(tag, parent)
        parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent
        node        = Oga::XML::Element.new(:name => tag)

        parent_node.children << node

        return node
      end

      ##
      # Check if input is a KAF file.
      #
      # @return [Boolean]
      #
      def is_kaf?
        return !!document.at_xpath('KAF')
      end

      ##
      # Path of the aspects lexicon for the current language.
      #
      # The second argument of File.expand_path is only used as base when
      # `aspects_path` is relative; absolute paths are returned as is.
      #
      # @return [String]
      #
      def aspects_file
        return File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
      end
    end # Processor
  end # PropertyTagger
end # Opener
|
@@ -1,8 +1,13 @@
|
|
1
1
|
require 'open3'
|
2
2
|
require 'slop'
|
3
|
+
require 'oga'
|
4
|
+
|
5
|
+
require 'rexml/document'
|
6
|
+
require 'rexml/formatters/pretty'
|
3
7
|
|
4
8
|
require_relative 'property_tagger/version'
|
5
9
|
require_relative 'property_tagger/cli'
|
10
|
+
require_relative 'property_tagger/processor'
|
6
11
|
|
7
12
|
module Opener
|
8
13
|
##
|
@@ -28,15 +33,6 @@ module Opener
|
|
28
33
|
@options = options
|
29
34
|
end
|
30
35
|
|
31
|
-
##
|
32
|
-
# Returns a String containing the command to use for executing the kernel.
|
33
|
-
#
|
34
|
-
# @return [String]
|
35
|
-
#
|
36
|
-
def command
|
37
|
-
return "python -E #{kernel} #{args.join(' ')} --path #{path}"
|
38
|
-
end
|
39
|
-
|
40
36
|
##
|
41
37
|
# Get the resource path for the lexicon files, defaults to an ENV variable
|
42
38
|
#
|
@@ -50,7 +46,7 @@ module Opener
|
|
50
46
|
raise ArgumentError, 'No lexicon path provided'
|
51
47
|
end
|
52
48
|
|
53
|
-
return path
|
49
|
+
return File.expand_path(path)
|
54
50
|
end
|
55
51
|
|
56
52
|
##
|
@@ -61,41 +57,15 @@ module Opener
|
|
61
57
|
# @return [Array]
|
62
58
|
#
|
63
59
|
def run(input)
|
64
|
-
|
65
|
-
|
66
|
-
raise stderr unless process.success?
|
60
|
+
output = process(input)
|
67
61
|
|
68
|
-
return
|
62
|
+
return output
|
69
63
|
end
|
70
64
|
|
71
65
|
protected
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# this is a workaround
|
76
|
-
#
|
77
|
-
def capture(input)
|
78
|
-
Open3.popen3(*command.split(" ")) {|i, o, e, t|
|
79
|
-
out_reader = Thread.new { o.read }
|
80
|
-
err_reader = Thread.new { e.read }
|
81
|
-
i.write input
|
82
|
-
i.close
|
83
|
-
[out_reader.value, err_reader.value, t.value]
|
84
|
-
}
|
85
|
-
end
|
86
|
-
|
87
|
-
##
|
88
|
-
# @return [String]
|
89
|
-
#
|
90
|
-
def core_dir
|
91
|
-
return File.expand_path('../../../core', __FILE__)
|
92
|
-
end
|
93
|
-
|
94
|
-
##
|
95
|
-
# @return [String]
|
96
|
-
#
|
97
|
-
def kernel
|
98
|
-
return File.join(core_dir, 'hotel_property_tagger_nl_en.py')
|
66
|
+
def process(input)
|
67
|
+
processor = Opener::PropertyTagger::Processor.new(input, path, !args.include?("--no-time"))
|
68
|
+
return processor.process
|
99
69
|
end
|
100
70
|
end # PropertyTagger
|
101
71
|
end # Opener
|
@@ -7,20 +7,15 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.summary = 'Property tagger for hotels in Dutch and English.'
|
8
8
|
gem.description = gem.summary
|
9
9
|
gem.homepage = 'http://opener-project.github.com/'
|
10
|
-
gem.extensions = ['ext/hack/Rakefile']
|
11
10
|
|
12
11
|
gem.license = 'Apache 2.0'
|
13
12
|
|
14
13
|
gem.required_ruby_version = '>= 1.9.2'
|
15
14
|
|
16
15
|
gem.files = Dir.glob([
|
17
|
-
'core/data/**/*',
|
18
|
-
'core/*.py',
|
19
|
-
'ext/**/*',
|
20
16
|
'lib/**/*',
|
21
17
|
'config.ru',
|
22
18
|
'*.gemspec',
|
23
|
-
'*_requirements.txt',
|
24
19
|
'README.md',
|
25
20
|
'LICENSE.txt',
|
26
21
|
'exec/**/*',
|
@@ -33,10 +28,9 @@ Gem::Specification.new do |gem|
|
|
33
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
34
29
|
gem.add_dependency 'opener-core', '~> 2.2'
|
35
30
|
|
36
|
-
gem.add_dependency '
|
37
|
-
gem.add_dependency 'rake'
|
38
|
-
gem.add_dependency 'cliver'
|
31
|
+
gem.add_dependency 'oga'
|
39
32
|
|
40
33
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
41
34
|
gem.add_development_dependency 'cucumber'
|
35
|
+
gem.add_development_dependency 'rake'
|
42
36
|
end
|
data/task/test.rake
CHANGED
metadata
CHANGED
@@ -1,184 +1,162 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '2.2'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
15
|
version_requirements: !ruby/object:Gem::Requirement
|
23
16
|
requirements:
|
24
|
-
- -
|
17
|
+
- - ~>
|
25
18
|
- !ruby/object:Gem::Version
|
26
19
|
version: '2.2'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: opener-webservice
|
29
20
|
requirement: !ruby/object:Gem::Requirement
|
30
21
|
requirements:
|
31
|
-
- -
|
22
|
+
- - ~>
|
32
23
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.
|
34
|
-
type: :runtime
|
24
|
+
version: '2.2'
|
35
25
|
prerelease: false
|
26
|
+
type: :runtime
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: opener-webservice
|
36
29
|
version_requirements: !ruby/object:Gem::Requirement
|
37
30
|
requirements:
|
38
|
-
- -
|
31
|
+
- - ~>
|
39
32
|
- !ruby/object:Gem::Version
|
40
33
|
version: '2.1'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: opener-core
|
43
34
|
requirement: !ruby/object:Gem::Requirement
|
44
35
|
requirements:
|
45
|
-
- -
|
36
|
+
- - ~>
|
46
37
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2.
|
48
|
-
type: :runtime
|
38
|
+
version: '2.1'
|
49
39
|
prerelease: false
|
40
|
+
type: :runtime
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: opener-core
|
50
43
|
version_requirements: !ruby/object:Gem::Requirement
|
51
44
|
requirements:
|
52
|
-
- -
|
45
|
+
- - ~>
|
53
46
|
- !ruby/object:Gem::Version
|
54
47
|
version: '2.2'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: nokogiri
|
57
48
|
requirement: !ruby/object:Gem::Requirement
|
58
49
|
requirements:
|
59
|
-
- -
|
50
|
+
- - ~>
|
60
51
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
62
|
-
type: :runtime
|
52
|
+
version: '2.2'
|
63
53
|
prerelease: false
|
54
|
+
type: :runtime
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: oga
|
64
57
|
version_requirements: !ruby/object:Gem::Requirement
|
65
58
|
requirements:
|
66
|
-
- -
|
59
|
+
- - '>='
|
67
60
|
- !ruby/object:Gem::Version
|
68
61
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: rake
|
71
62
|
requirement: !ruby/object:Gem::Requirement
|
72
63
|
requirements:
|
73
|
-
- -
|
64
|
+
- - '>='
|
74
65
|
- !ruby/object:Gem::Version
|
75
66
|
version: '0'
|
76
|
-
type: :runtime
|
77
67
|
prerelease: false
|
68
|
+
type: :runtime
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
78
71
|
version_requirements: !ruby/object:Gem::Requirement
|
79
72
|
requirements:
|
80
|
-
- -
|
73
|
+
- - ~>
|
81
74
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: cliver
|
75
|
+
version: '3.0'
|
85
76
|
requirement: !ruby/object:Gem::Requirement
|
86
77
|
requirements:
|
87
|
-
- -
|
78
|
+
- - ~>
|
88
79
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
80
|
+
version: '3.0'
|
91
81
|
prerelease: false
|
82
|
+
type: :development
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: cucumber
|
92
85
|
version_requirements: !ruby/object:Gem::Requirement
|
93
86
|
requirements:
|
94
|
-
- -
|
87
|
+
- - '>='
|
95
88
|
- !ruby/object:Gem::Version
|
96
89
|
version: '0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: rspec
|
99
90
|
requirement: !ruby/object:Gem::Requirement
|
100
91
|
requirements:
|
101
|
-
- -
|
92
|
+
- - '>='
|
102
93
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
104
|
-
type: :development
|
94
|
+
version: '0'
|
105
95
|
prerelease: false
|
96
|
+
type: :development
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rake
|
106
99
|
version_requirements: !ruby/object:Gem::Requirement
|
107
100
|
requirements:
|
108
|
-
- -
|
101
|
+
- - '>='
|
109
102
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: cucumber
|
103
|
+
version: '0'
|
113
104
|
requirement: !ruby/object:Gem::Requirement
|
114
105
|
requirements:
|
115
|
-
- -
|
106
|
+
- - '>='
|
116
107
|
- !ruby/object:Gem::Version
|
117
108
|
version: '0'
|
118
|
-
type: :development
|
119
109
|
prerelease: false
|
120
|
-
|
121
|
-
requirements:
|
122
|
-
- - ">="
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
110
|
+
type: :development
|
125
111
|
description: Property tagger for hotels in Dutch and English.
|
126
|
-
email:
|
112
|
+
email:
|
127
113
|
executables:
|
128
114
|
- property-tagger
|
129
115
|
- property-tagger-daemon
|
130
116
|
- property-tagger-server
|
131
|
-
extensions:
|
132
|
-
- ext/hack/Rakefile
|
117
|
+
extensions: []
|
133
118
|
extra_rdoc_files: []
|
134
119
|
files:
|
135
|
-
- LICENSE.txt
|
136
|
-
- README.md
|
137
|
-
- bin/property-tagger
|
138
|
-
- bin/property-tagger-daemon
|
139
|
-
- bin/property-tagger-server
|
140
|
-
- config.ru
|
141
|
-
- core/extract_aspects.py
|
142
|
-
- core/hotel_property_tagger_nl_en.py
|
143
|
-
- exec/property-tagger.rb
|
144
|
-
- ext/hack/Rakefile
|
145
120
|
- lib/opener/property_tagger.rb
|
146
121
|
- lib/opener/property_tagger/cli.rb
|
147
|
-
- lib/opener/property_tagger/
|
122
|
+
- lib/opener/property_tagger/processor.rb
|
148
123
|
- lib/opener/property_tagger/server.rb
|
149
124
|
- lib/opener/property_tagger/version.rb
|
125
|
+
- lib/opener/property_tagger/public/markdown.css
|
150
126
|
- lib/opener/property_tagger/views/index.erb
|
151
127
|
- lib/opener/property_tagger/views/result.erb
|
128
|
+
- config.ru
|
152
129
|
- opener-property-tagger.gemspec
|
153
|
-
-
|
154
|
-
-
|
130
|
+
- README.md
|
131
|
+
- LICENSE.txt
|
132
|
+
- exec/property-tagger.rb
|
155
133
|
- task/lexicons.rake
|
156
|
-
- task/python.rake
|
157
|
-
- task/requirements.rake
|
158
134
|
- task/test.rake
|
135
|
+
- bin/property-tagger
|
136
|
+
- bin/property-tagger-daemon
|
137
|
+
- bin/property-tagger-server
|
159
138
|
homepage: http://opener-project.github.com/
|
160
139
|
licenses:
|
161
140
|
- Apache 2.0
|
162
141
|
metadata: {}
|
163
|
-
post_install_message:
|
142
|
+
post_install_message:
|
164
143
|
rdoc_options: []
|
165
144
|
require_paths:
|
166
145
|
- lib
|
167
146
|
required_ruby_version: !ruby/object:Gem::Requirement
|
168
147
|
requirements:
|
169
|
-
- -
|
148
|
+
- - '>='
|
170
149
|
- !ruby/object:Gem::Version
|
171
150
|
version: 1.9.2
|
172
151
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
173
152
|
requirements:
|
174
|
-
- -
|
153
|
+
- - '>='
|
175
154
|
- !ruby/object:Gem::Version
|
176
155
|
version: '0'
|
177
156
|
requirements: []
|
178
|
-
rubyforge_project:
|
179
|
-
rubygems_version: 2.
|
180
|
-
signing_key:
|
157
|
+
rubyforge_project:
|
158
|
+
rubygems_version: 2.1.9
|
159
|
+
signing_key:
|
181
160
|
specification_version: 4
|
182
161
|
summary: Property tagger for hotels in Dutch and English.
|
183
162
|
test_files: []
|
184
|
-
has_rdoc:
|
data/core/extract_aspects.py
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
from lxml import etree
|
4
|
-
import sys
|
5
|
-
#filename='/Users/ruben/CODE/VU-sentiment-lexicon-xml/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml'
|
6
|
-
|
7
|
-
root = etree.parse(sys.stdin).getroot()
|
8
|
-
|
9
|
-
for element in root.findall('Lexicon/LexicalEntry'):
|
10
|
-
ele_lemma = element.findall('Lemma')[0]
|
11
|
-
ele_domain = element.findall('Sense/Domain')[0]
|
12
|
-
pos = element.get('partOfSpeech','unknown_pos')
|
13
|
-
if ele_lemma is not None and ele_domain is not None:
|
14
|
-
lemma = ele_lemma.get('writtenForm','').lower()
|
15
|
-
aspect = ele_domain.get('aspect','').lower()
|
16
|
-
if lemma!='' and aspect!='':
|
17
|
-
print lemma.encode('utf-8')+'\t'+pos.encode('utf-8')+'\t'+aspect.encode('utf-8')
|
18
|
-
|
@@ -1,138 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
import sys
|
4
|
-
import argparse
|
5
|
-
|
6
|
-
import codecs
|
7
|
-
import os
|
8
|
-
|
9
|
-
this_folder = os.path.dirname(os.path.realpath(__file__))
|
10
|
-
|
11
|
-
# This updates the load path to ensure that the local site-packages directory
|
12
|
-
# can be used to load packages (e.g. a locally installed copy of lxml).
|
13
|
-
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
|
14
|
-
|
15
|
-
from VUKafParserPy import KafParser
|
16
|
-
from lxml import etree
|
17
|
-
from collections import defaultdict
|
18
|
-
|
19
|
-
__desc='VUA property tagger'
|
20
|
-
__last_edited='20may2014'
|
21
|
-
__version='1.0'
|
22
|
-
|
23
|
-
###
|
24
|
-
__module_dir = os.path.dirname(__file__)
|
25
|
-
max_ngram = 1
|
26
|
-
verbose = False
|
27
|
-
##
|
28
|
-
|
29
|
-
|
30
|
-
########################################
|
31
|
-
## Format of the file:
|
32
|
-
#lemma pos aspect
|
33
|
-
#lemma pos aspect
|
34
|
-
########################################
|
35
|
-
def loadAspects(my_lang,this_file=None):
|
36
|
-
my_aspects = {}
|
37
|
-
if this_file is not None:
|
38
|
-
aspects_filename = this_file
|
39
|
-
else:
|
40
|
-
filename = "{0}.txt".format(my_lang)
|
41
|
-
print>>sys.stderr, "filename thingy",filename
|
42
|
-
print>>sys.stderr, "path thingy",arguments.path
|
43
|
-
aspects_filename = os.path.join(arguments.path,filename)
|
44
|
-
|
45
|
-
if not os.path.exists(aspects_filename):
|
46
|
-
print>>sys.stderr,'ERROR: file with aspects for the language',my_lang,'not found in',aspects_filename
|
47
|
-
else:
|
48
|
-
fic = codecs.open(aspects_filename,'r','utf-8')
|
49
|
-
for line in fic:
|
50
|
-
fields = line.strip().split('\t')
|
51
|
-
if len(fields) == 3:
|
52
|
-
lemma,pos,aspect = fields
|
53
|
-
my_aspects[lemma] = aspect
|
54
|
-
fic.close()
|
55
|
-
return aspects_filename, my_aspects
|
56
|
-
########################################
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
###### MAIN ########
|
61
|
-
|
62
|
-
argument_parser = argparse.ArgumentParser(description='Tags a text with polarities at lemma level')
|
63
|
-
argument_parser.add_argument("--no-time",action="store_false", default=True, dest="my_time_stamp",help="For not including timestamp in header")
|
64
|
-
argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon", help="Force to use this lexicon")
|
65
|
-
argument_parser.add_argument("--path", action="store", default=None, dest="path", help="Set the path where the property aspects are found.")
|
66
|
-
|
67
|
-
arguments = argument_parser.parse_args()
|
68
|
-
|
69
|
-
if not sys.stdin.isatty():
|
70
|
-
## READING FROM A PIPE
|
71
|
-
pass
|
72
|
-
else:
|
73
|
-
print>>sys.stderr,'Input stream required.'
|
74
|
-
print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
|
75
|
-
print>>sys.stderr,sys.argv[0]+' -h for help'
|
76
|
-
sys.exit(-1)
|
77
|
-
|
78
|
-
|
79
|
-
## Load the tree and the list of terms with the id
|
80
|
-
my_data = []
|
81
|
-
try:
|
82
|
-
my_kaf_tree = KafParser(sys.stdin)
|
83
|
-
except Exception as e:
|
84
|
-
print>>sys.stdout,'Error parsing input. Input is required to be KAF'
|
85
|
-
print>>sys.stdout,str(e)
|
86
|
-
sys.exit(2)
|
87
|
-
|
88
|
-
|
89
|
-
## Get language from the KAF file
|
90
|
-
my_lang = my_kaf_tree.getLanguage()
|
91
|
-
|
92
|
-
my_aspects_filename = my_aspects = None
|
93
|
-
if arguments.lexicon is None:
|
94
|
-
if my_lang not in ['nl','en','de','fr','it','es']:
|
95
|
-
print>>sys.stdout,'Error in the language specified in your KAF. The language is ',my_lang,' and possible values for this module '
|
96
|
-
print>>sys.stdout,'are nl for Dutch ,en for English, es Spanish, fr French, it Italian or de German'
|
97
|
-
sys.exit(1)
|
98
|
-
|
99
|
-
my_aspects_filename, my_aspects = loadAspects(my_lang)
|
100
|
-
else:
|
101
|
-
my_aspects_filename, my_aspects = loadAspects(my_lang,this_file=arguments.lexicon)
|
102
|
-
|
103
|
-
if verbose:
|
104
|
-
print>>sys.stderr,'Loaded ',len(my_aspects),'aspects from',my_aspects_filename
|
105
|
-
|
106
|
-
|
107
|
-
for term in my_kaf_tree.getTerms():
|
108
|
-
my_data.append((term.getLemma(),term.getId()))
|
109
|
-
if verbose: print>>sys.stderr,'Number of terms in the kaf file:',len(my_data)
|
110
|
-
|
111
|
-
|
112
|
-
current_token = found = 0
|
113
|
-
uniq_aspects = defaultdict(list)
|
114
|
-
while current_token < len(my_data):
|
115
|
-
for tam_ngram in range(1,max_ngram+1):
|
116
|
-
# Build an n-gram of size tam_ngram and beginning in current_token
|
117
|
-
if current_token + tam_ngram <= len(my_data):
|
118
|
-
ngram = ' '.join(lemma for lemma,_ in my_data[current_token:current_token+tam_ngram])
|
119
|
-
aspect = my_aspects.get(ngram.lower(),None)
|
120
|
-
if aspect is not None:
|
121
|
-
list_of_ids = [id for _,id in my_data[current_token:current_token+tam_ngram]]
|
122
|
-
uniq_aspects[aspect].append((list_of_ids,ngram))
|
123
|
-
current_token += 1
|
124
|
-
|
125
|
-
|
126
|
-
## Code for generating the propery layer included in the Parser
|
127
|
-
for aspect, list_of_lists in uniq_aspects.items():
|
128
|
-
for list_of_ids, str_text in list_of_lists:
|
129
|
-
my_kaf_tree.add_property(aspect,list_of_ids,str_text)
|
130
|
-
|
131
|
-
my_kaf_tree.addLinguisticProcessor(__desc,__last_edited+'_'+__version,'features', arguments.my_time_stamp)
|
132
|
-
my_kaf_tree.saveToFile(sys.stdout)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
data/ext/hack/Rakefile
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
import File.expand_path('../../../task/requirements.rake', __FILE__)
|
2
|
-
import File.expand_path('../../../task/python.rake', __FILE__)
|
3
|
-
|
4
|
-
task :default => :requirements do
|
5
|
-
Dir.chdir(File.expand_path('../../../', __FILE__)) do
|
6
|
-
Rake::Task['core/site-packages/pre_install'].invoke
|
7
|
-
end
|
8
|
-
end
|
@@ -1 +0,0 @@
|
|
1
|
-
https://github.com/opener-project/VU-kaf-parser/archive/master.zip#egg=VUKafParserPy
|
data/task/compile.rake
DELETED
data/task/python.rake
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
# NOTE: pre_build/pre_install directories are created by pip.
|
2
|
-
|
3
|
-
directory 'core/site-packages/pre_install' do |task|
|
4
|
-
sh "pip install --requirement=pre_install_requirements.txt " \
|
5
|
-
"--target=#{task.name} --ignore-installed"
|
6
|
-
end
|
7
|
-
|
8
|
-
namespace :python do
|
9
|
-
desc 'Installs Python packages in a local directory'
|
10
|
-
task :compile => ['core/site-packages/pre_install']
|
11
|
-
end
|