opener-opinion-detector-basic 2.0.7 → 3.0.0
- checksums.yaml +4 -4
- data/README.md +1 -5
- data/lib/opener/opinion_detector_basic.rb +12 -65
- data/lib/opener/opinion_detector_basic/opinion.rb +171 -0
- data/lib/opener/opinion_detector_basic/processor.rb +329 -0
- data/lib/opener/opinion_detector_basic/term.rb +160 -0
- data/lib/opener/opinion_detector_basic/version.rb +1 -1
- data/opener-opinion-detector-basic.gemspec +5 -10
- metadata +24 -39
- data/core/opinion_detector_basic_multi.py +0 -512
- data/ext/hack/Rakefile +0 -8
- data/pre_install_requirements.txt +0 -1
data/lib/opener/opinion_detector_basic/term.rb ADDED
@@ -0,0 +1,160 @@
module Opener
  class OpinionDetectorBasic
    class Term
      attr_reader :node, :sentence, :is_conjunction
      attr_accessor :use, :accumulated_strength, :list_ids

      def initialize(node, document, language)
        @node = node
        @sentence = get_sentence(document)
        @use = true
        @accumulated_strength = strength
        @list_ids = [id]
        @is_conjunction = is_conjunction?(language)
      end

      ##
      # Returns the term id.
      #
      # @return [String]
      #
      def id
        @id ||= node.get('tid')
      end

      ##
      # Returns the lemma of the term.
      #
      # @return [String]
      #
      def lemma
        @lemma ||= node.get('lemma')
      end

      ##
      # Returns the part of speech of the term.
      #
      # @return [String]
      #
      def pos
        @pos ||= node.get('pos')
      end

      ##
      # Returns the sentiment modifier type if it exists.
      #
      # @return [String|NilClass]
      #
      def sentiment_modifier
        @sentiment_modifier ||= if sentiment = node.xpath('sentiment').first
          sentiment.get('sentiment_modifier')
        end
      end

      ##
      # Returns the polarity of the term if it exists.
      #
      # @return [String|NilClass]
      #
      def polarity
        @polarity ||= if sentiment = node.xpath('sentiment').first
          sentiment.get('polarity')
        end
      end

      ##
      # Returns the actual word ids that construct the lemma.
      #
      # @return [Array]
      #
      def target_ids
        @target_ids ||= node.xpath('span/target').map {|target| target.get('id')}
      end

      ##
      # Returns the strength of the term depending on its type.
      #
      # @return [Integer]
      #
      def strength
        if polarity == "positive"
          return 1
        elsif polarity == "negative"
          return -1
        end

        if is_intensifier?
          return 2
        elsif is_shifter?
          return -1
        end

        return 0
      end

      ##
      # Returns the sentence id that the term belongs to in the document.
      #
      # @return [String]
      #
      def get_sentence(document)
        document
          .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
          .first
          .get('sent')
      end

      ##
      # Checks if a term is an intensifier.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_intensifier?
        sentiment_modifier == "intensifier"
      end

      ##
      # Checks if a term is a shifter.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_shifter?
        sentiment_modifier == "shifter"
      end

      ##
      # Checks if a term is an expression.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_expression?
        use && !!polarity
      end

      ##
      # Checks if a term is a conjunction.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_conjunction?(language)
        conjunctions[language].include?(lemma)
      end

      ##
      # Map of conjunctions per language code
      #
      # @return [Hash]
      #
      def conjunctions
        {
          'nl' => [',','en'],
          'en' => [',','and'],
          'es' => [',','y','e'],
          'it' => [',','e','ed'],
          'de' => [',','und'],
          'fr' => [',','et']
        }
      end
    end # Term
  end # OpinionDetectorBasic
end # Opener
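Term wraps a single <term> node of a KAF document parsed with Oga, the new XML backend introduced by the gemspec change below. A minimal usage sketch follows; the KAF fragment, the w1/t1 ids, and the 'en' language code are made up for illustration and are not part of this release:

require 'oga'
require 'opener/opinion_detector_basic'

# Hypothetical single-term KAF fragment; real input comes from the
# upstream OpeNER tokenizer/POS pipeline.
kaf = <<-XML
<KAF xml:lang="en">
  <text>
    <wf wid="w1" sent="1">good</wf>
  </text>
  <terms>
    <term tid="t1" lemma="good" pos="G">
      <sentiment polarity="positive"/>
      <span><target id="w1"/></span>
    </term>
  </terms>
</KAF>
XML

document = Oga.parse_xml(kaf)
node     = document.xpath('KAF/terms/term').first

term = Opener::OpinionDetectorBasic::Term.new(node, document, 'en')

term.polarity       # => "positive"
term.strength       # => 1
term.sentence       # => "1" (sent attribute of the first word in the span)
term.is_conjunction # => false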
data/opener-opinion-detector-basic.gemspec CHANGED
@@ -7,18 +7,14 @@ Gem::Specification.new do |gem|
   gem.summary     = 'Basic Opinion Detector.'
   gem.description = gem.summary
   gem.homepage    = 'http://opener-project.github.com/'
-  gem.extensions  = ['ext/hack/Rakefile']
   gem.license     = 'Apache 2.0'

   gem.required_ruby_version = '>= 1.9.2'

   gem.files = Dir.glob([
-    'core/*',
-    'ext/**/*',
     'lib/**/*',
     'config.ru',
     '*.gemspec',
-    '*_requirements.txt',
     'README.md',
     'LICENSE.txt',
     'exec/**/*',
@@ -30,12 +26,11 @@ Gem::Specification.new do |gem|
   gem.add_dependency 'opener-daemons', '~> 2.2'
   gem.add_dependency 'opener-webservice', '~> 2.1'
   gem.add_dependency 'opener-core', '~> 2.2'
-
-  gem.add_dependency '
-  gem.add_dependency 'nokogiri'
-  gem.add_dependency 'cliver'
-  gem.add_dependency 'slop', '~> 3.5'
+
+  gem.add_dependency 'oga'

   gem.add_development_dependency 'rspec', '~> 3.0'
   gem.add_development_dependency 'cucumber'
+  gem.add_development_dependency 'rake'
+  gem.add_development_dependency 'benchmark-ips', '~> 2.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: opener-opinion-detector-basic
 version: !ruby/object:Gem::Version
-  version: 2.0.7
+  version: 3.0.0
 platform: ruby
 authors:
 - development@olery.com
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-06-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: opener-daemons
@@ -53,7 +53,7 @@ dependencies:
   prerelease: false
   type: :runtime
 - !ruby/object:Gem::Dependency
-  name:
+  name: oga
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '>='
@@ -67,21 +67,21 @@ dependencies:
   prerelease: false
   type: :runtime
 - !ruby/object:Gem::Dependency
-  name:
+  name: rspec
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '3.0'
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '3.0'
   prerelease: false
-  type: :
+  type: :development
 - !ruby/object:Gem::Dependency
-  name:
+  name: cucumber
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '>='
@@ -93,47 +93,33 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
   prerelease: false
-  type: :
+  type: :development
 - !ruby/object:Gem::Dependency
-  name:
+  name: rake
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   prerelease: false
-  type: :
+  type: :development
 - !ruby/object:Gem::Dependency
-  name:
+  name: benchmark-ips
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '
+        version: '2.0'
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '
-  prerelease: false
-  type: :development
-- !ruby/object:Gem::Dependency
-  name: cucumber
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: '0'
+        version: '2.0'
   prerelease: false
   type: :development
 description: Basic Opinion Detector.
@@ -142,21 +128,20 @@ executables:
 - opinion-detector-basic
 - opinion-detector-basic-daemon
 - opinion-detector-basic-server
-extensions:
-- ext/hack/Rakefile
+extensions: []
 extra_rdoc_files: []
 files:
-- core/opinion_detector_basic_multi.py
-- ext/hack/Rakefile
 - lib/opener/opinion_detector_basic.rb
 - lib/opener/opinion_detector_basic/cli.rb
+- lib/opener/opinion_detector_basic/opinion.rb
+- lib/opener/opinion_detector_basic/processor.rb
 - lib/opener/opinion_detector_basic/server.rb
+- lib/opener/opinion_detector_basic/term.rb
 - lib/opener/opinion_detector_basic/version.rb
 - lib/opener/opinion_detector_basic/public/markdown.css
 - lib/opener/opinion_detector_basic/views/index.erb
 - config.ru
 - opener-opinion-detector-basic.gemspec
-- pre_install_requirements.txt
 - README.md
 - LICENSE.txt
 - exec/opinion-detector-basic.rb
data/core/opinion_detector_basic_multi.py DELETED
@@ -1,512 +0,0 @@
#!/usr/bin/env python

import sys
import getopt
import os

this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))

from VUKafParserPy import KafParser
from collections import defaultdict
import operator
import pprint
import lxml
from lxml import etree
import logging


def mix_lists(l1,l2):
    newl=[]
    min_l = min(len(l1),len(l2))
    for x in range(min_l):
        newl.append(l1[x])
        newl.append(l2[x])

    if len(l1)>len(l2):
        newl.extend(l1[min_l:])
    elif len(l2)>len(l1):
        newl.extend(l2[min_l:])
    return newl


class OpinionExpression:
    def __init__(self,spans,sentence,value):
        self.ids = spans
        self.sentence = sentence
        self.value = value
        self.target_ids = []
        self.candidates_r=[]
        self.candidates_l=[]
        self.holder = []

    def __repr__(self):
        r='Ids:'+'#'.join(self.ids)+' Sent:'+self.sentence+' Value:'+str(self.value)+' Target:'+'#'.join(self.target_ids)+'\n'
        r+='Right cand: '+str(self.candidates_r)+'\n'
        r+='Left cand: '+str(self.candidates_l)+'\n'
        return r

class MyToken:
    def __init__(self,id,lemma,pos,polarity,sent_mod,sent):
        self.id = id
        self.lemma = lemma
        self.pos = pos
        self.polarity = polarity
        self.sent_mod = sent_mod
        self.sentence = sent
        self.use_it = True
        self.list_ids = [id]
        self.value = 0

        if polarity == 'positive':
            self.value = 1
        elif polarity == 'negative':
            self.value = -1

        if sent_mod == 'intensifier':
            self.value = 2
        elif sent_mod == 'shifter':
            self.value = -1

    def isNegator(self):
        return self.sent_mod == 'shifter'

    def isIntensifier(self):
        return self.sent_mod == 'intensifier'

    def is_opinion_expression(self):
        return self.use_it and self.polarity is not None

    def __repr__(self):
        if self.use_it:
            return self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
        else:
            return '\t'+self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)


def obtain_opinion_expressions(tokens,lang='nl'):
    logging.debug(' Obtaining opinion expressions')
    my_tokens = tokens[:]

    accumulate_several_modifiers = True
    apply_modifiers = True
    apply_conjunctions = True

    ## Acumulate doble/triple intensifiers or negators
    if accumulate_several_modifiers:
        logging.debug(' Accumulating modifiers')
        t = 0
        while t < len(my_tokens):
            if t+1 < len(my_tokens):
                if (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()) and my_tokens[t+1].isNegator():
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t].use_it = False
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
                elif my_tokens[t].isNegator() and my_tokens[t+1].isIntensifier():
                    my_tokens[t+1].value *= -1
                    my_tokens[t].use_it = False
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
                elif my_tokens[t].isIntensifier() and my_tokens[t+1].isIntensifier():
                    if my_tokens[t].value >= 0:
                        my_tokens[t+1].value = my_tokens[t].value + my_tokens[t+1].value
                    else:
                        my_tokens[t+1].value = my_tokens[t].value - my_tokens[t+1].value
                    my_tokens[t].use_it = False
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))

            t+=1
    ###########################################

    ##Apply intensifiers/negators over the next elements
    if apply_modifiers:
        logging.debug(' Applying modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
                ## Try to modify the next token:
                if t+1<len(my_tokens):
                    #print 'Score: ',my_tokens[t]
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    my_tokens[t].use_it = False
                    logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
            t += 1
    ###########################################

    if apply_conjunctions:
        if lang=='nl':
            concat = [',','en']
        elif lang=='en':
            concat = [',','and']
        elif lang=='es':
            concat = [',','y','e']
        elif lang=='it':
            concat = [',','e','ed']
        elif lang=='de':
            concat = [',','und']
        elif lang == 'fr':
            concat=[',','et']
        logging.debug(' Applying conjunctions:'+str(concat))

        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
                #print 'FOUND ',my_tokens[t]
                logging.debug(' Found token '+str(my_tokens[t]))
                list_aux = my_tokens[t].list_ids
                used = [t]
                value_aux = my_tokens[t].value
                my_tokens[t].use_it = False
                #print 'Modified',my_tokens[t]

                x = t+1
                while True:
                    if x>=len(my_tokens):
                        break

                    if my_tokens[x].lemma in concat:
                        ## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
                        my_tokens[x].use_it = False
                        x+=1
                    elif (my_tokens[x].use_it and my_tokens[x].value!=0):
                        #print '\Also ',my_tokens[x]
                        logging.debug(' Found token '+str(my_tokens[x]))
                        list_aux += my_tokens[x].list_ids

                        used.append(x)
                        my_tokens[x].use_it = False
                        value_aux += my_tokens[x].value
                        x += 1
                    else:
                        break
                #print 'OUT OF THE WHILE'
                ##The last one in the list used is the one accumulating all

                last_pos = used[-1]
                my_tokens[last_pos].value = value_aux
                my_tokens[last_pos].list_ids = list_aux
                my_tokens[last_pos].use_it = True
                logging.debug(' Regenerating '+str(my_tokens[last_pos]))
                t = x ## next token
            t += 1

    ## Create OpinionExpression
    my_opinion_exps = []
    logging.debug(' Generating output')
    for token in my_tokens:
        if token.use_it and token.value != 0:
            op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
            my_opinion_exps.append(op_exp)
    return my_opinion_exps


'''
def get_distance(id1, id2):
    pos1 = int(id1[id1.find('_')+1:])
    pos2 = int(id2[id2.find('_')+1:])
    if pos1>pos2:
        return pos1-pos2
    else:
        return pos2-pos1
'''


def obtain_holders(ops_exps,sentences,lang):
    if lang=='nl':
        holders = ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun']
    elif lang=='en':
        holders = ['i','we','he','she','they','it','you']
    elif lang =='es':
        holders = ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras']
    elif lang =='it':
        holders = ['io','tu','noi','voi','loro','lei','lui']
    elif lang == 'de':
        holders = ['ich','du','wir','ihr','sie','er']
    elif lang == 'fr':
        holders = ['je','tu','lui','elle','nous','vous','ils','elles']

    logging.debug('Obtaining holders with list: '+str(holders))

    for oe in ops_exps:
        sent = oe.sentence
        list_terms = sentences[str(sent)]
        for lemma, pos, term_id in list_terms:
            if lemma in holders:
                oe.holder.append(term_id)
                logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
                break


#This is specific for the basic version
def filter_candidates(candidates,ids_oe):
    ##filtered = [(lemma, pos,term_id) for (lemma,pos, term_id) in candidates if len(lemma)>=4 and term_id not in ids_oe]
    filtered = [(lemma,pos,id) for (lemma,pos,id) in candidates if pos in ['N','R']]
    return filtered

def obtain_targets_improved(ops_exps,sentences):
    logging.debug(' Obtaining targets improved')

    ##sentences --> dict [str(numsent)] ==> list of (lemma, term)id

    all_ids_in_oe = []
    for oe in ops_exps:
        all_ids_in_oe.extend(oe.ids)

    for oe in ops_exps:
        logging.debug(' OpExp: '+str(oe))

        ids_in_oe = oe.ids
        sent = oe.sentence
        list_terms = sentences[str(sent)]

        ###########################################
        #First rule: noun to the right within maxdistance tokens
        max_distance_right = 3
        biggest_index = -1
        for idx, (lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                biggest_index = idx

        if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
            pass
        else:
            candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
            ##Filter candidates
            oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered right'+str(oe.candidates_r))

        ######################################################################################

        ###########################################
        max_distance_left = 3
        smallest_index = 0
        for idx,(lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                smallest_index = idx
                break
        if smallest_index == 0:
            pass
        else:
            candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
            ##Filter candidates
            oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered left: '+str(oe.candidates_l))

        ######################################################################################

    ## filling oe.target_ids
    assigned_as_targets = []

    # First we assign to all the first in the right, if any, and not assigned
    logging.debug(' Applying first to the right rule')
    for oe in ops_exps:
        if len(oe.candidates_r) !=0:
            lemma, pos, id = oe.candidates_r[0]
            if id not in assigned_as_targets:
                oe.target_ids.append(id)
                ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                logging.debug(' OpExp '+str(oe)+' selected '+id)

    logging.debug(' Applying most close rule')
    for oe in ops_exps:
        if len(oe.target_ids) == 0: # otherwise it's solved
            intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
            for id in intercalados_list:
                if id not in assigned_as_targets:
                    oe.target_ids.append(id)
                    ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                    logging.debug(' OpExp '+str(oe)+' selected '+id)
                    break

######## MAIN ROUTINE ############

## Check if we are reading from a pipeline
if sys.stdin.isatty():
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    sys.exit(-1)
########################################

logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)

## Processing the parameters
my_time_stamp = True
remove_opinions = True
opinion_strength = True
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
        elif opt == "--no-remove-opinions":
            remove_opinions = False
        elif opt == "--no-opinion-strength":
            opinion_strength = False
except getopt.GetoptError:
    pass
#########################################

logging.debug('Include timestamp: '+str(my_time_stamp))

# Parsing the KAF file
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    print>>sys.stderr,'Error parsing input'
    print>>sys.stderr,'Stream input must be a valid KAF file'
    print>>sys.stderr,'Error: ',str(e)
    sys.exit(-1)

lang = my_kaf_tree.getLanguage()
## Creating data structure
sentences = defaultdict(list)
my_tokens = []

# CREATE the datastructure for the tokens
n=0
lemma_for_tid = {}
for term in my_kaf_tree.getTerms():
    n+=1
    term_id = term.getId()
    lemma = term.getLemma()
    lemma_for_tid[term_id] = lemma
    kaf_pos = term.getPos()
    list_span = term.get_list_span() ## List of token ids in the span layer of the term
    sentiment = term.getSentiment()
    polarity = sent_mod = None
    if sentiment is not None:
        polarity = sentiment.getPolarity()
        sent_mod = sentiment.getSentimentModifier()
    sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
    my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))

    sentences[str(sentence)].append((lemma,kaf_pos,term_id))
#############################

logging.debug('Num terms loaded: '+str(n))
logging.debug('Num sentences: '+str(len(sentences)))

logging.debug('Obtaining opinion expressions')
my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
print>>sys.stderr,my_ops_exps

logging.debug('Obtaining targets')
obtain_targets_improved(my_ops_exps,sentences)

logging.debug('Obtaining holders')
obtain_holders(my_ops_exps,sentences,lang)

## Create the elements
logging.debug('Generating KAF output')

if remove_opinions:
    my_kaf_tree.remove_opinion_layer()

for oe in my_ops_exps:
    op_ele = etree.Element('opinion')

    ## Holder
    if len(oe.holder)!=0:
        oe.holder.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
        op_hol = etree.Element('opinion_holder')
        op_hol.append(etree.Comment(c))
        op_ele.append(op_hol)
        span_op_hol = etree.Element('span')
        op_hol.append(span_op_hol)
        for id in oe.holder:
            span_op_hol.append(etree.Element('target',attrib={'id':id}))

    ## Target
    op_tar = etree.Element('opinion_target')
    op_ele.append(op_tar)

    if len(oe.target_ids)!=0: ## if there are no targets, there is no opinion element
        oe.target_ids.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
        op_tar.append(etree.Comment(c))
        span_op_tar = etree.Element('span')
        op_tar.append(span_op_tar)
        for id in oe.target_ids:
            span_op_tar.append(etree.Element('target',attrib={'id':id}))

    #Expression
    if oe.value > 0: pol = 'positive'
    elif oe.value < 0: pol = 'negative'
    else: pol = 'neutral'

    op_exp = etree.Element('opinion_expression')
    op_exp.set('polarity',pol)
    if opinion_strength:
        op_exp.set('strength',str(oe.value))

    op_ele.append(op_exp)
    oe.ids.sort()
    c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
    op_exp.append(etree.Comment(c))
    span_exp = etree.Element('span')
    op_exp.append(span_exp)
    for id in oe.ids:
        span_exp.append(etree.Element('target',attrib={'id':id}))

    ##Append the op_ele to the opinions layer
    my_kaf_tree.addElementToLayer('opinions', op_ele)

my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
logging.debug('Process finished')
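The scoring scheme above survives the rewrite: the values MyToken assigned (+1/-1 for polar terms, 2 for intensifiers, -1 for shifters) are exactly what Term#strength now returns, and modifiers compose by multiplication. A worked example in Ruby, on the assumption that the new processor.rb (whose body is not included in this excerpt) keeps the same accumulate-then-apply arithmetic:

# "not very good": a shifter, an intensifier, then a positive term.
shifter     = -1 # Term#strength for a shifter
intensifier =  2 # Term#strength for an intensifier
positive    =  1 # Term#strength for a positive polarity

# Accumulate: a shifter in front of an intensifier flips its sign.
accumulated = intensifier * shifter  # => -2

# Apply: the accumulated modifier multiplies the next polar term.
strength = positive * accumulated    # => -2, a strongly negative opinion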