opener-opinion-detector-basic 2.0.7 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -5
- data/lib/opener/opinion_detector_basic.rb +12 -65
- data/lib/opener/opinion_detector_basic/opinion.rb +171 -0
- data/lib/opener/opinion_detector_basic/processor.rb +329 -0
- data/lib/opener/opinion_detector_basic/term.rb +160 -0
- data/lib/opener/opinion_detector_basic/version.rb +1 -1
- data/opener-opinion-detector-basic.gemspec +5 -10
- metadata +24 -39
- data/core/opinion_detector_basic_multi.py +0 -512
- data/ext/hack/Rakefile +0 -8
- data/pre_install_requirements.txt +0 -1
@@ -0,0 +1,160 @@
|
|
1
|
+
module Opener
  class OpinionDetectorBasic
    ##
    # Wraps a single KAF `<term>` node and exposes the attributes the
    # opinion detector needs: lemma, part of speech, sentiment polarity,
    # modifier type and the sentence the term belongs to.
    #
    class Term
      attr_reader :node, :sentence, :is_conjunction
      attr_accessor :use, :accumulated_strength, :list_ids

      ##
      # @param node     [#get, #xpath] the `<term>` XML element
      #   (presumably an Oga element — it must respond to `get` and `xpath`).
      # @param document [#xpath] the parsed KAF document, used to resolve
      #   the sentence of the term's first word form.
      # @param language [String] language code used for conjunction lookup.
      #
      def initialize(node, document, language)
        @node                 = node
        @sentence             = get_sentence(document)
        @use                  = true
        @accumulated_strength = strength
        @list_ids             = [id]
        @is_conjunction       = is_conjunction?(language)
      end

      ##
      # Returns the term id.
      #
      # @return [String]
      #
      def id
        @id ||= node.get('tid')
      end

      ##
      # Returns the lemma of the term.
      #
      # @return [String]
      #
      def lemma
        @lemma ||= node.get('lemma')
      end

      ##
      # Returns the part of speech of the term.
      #
      # @return [String]
      #
      def pos
        @pos ||= node.get('pos')
      end

      ##
      # Returns the sentiment modifier type if it exists.
      #
      # @return [String|NilClass]
      #
      def sentiment_modifier
        @sentiment_modifier ||= if sentiment = node.xpath('sentiment').first
          sentiment.get('sentiment_modifier')
        end
      end

      ##
      # Returns the polarity of the term if it exists.
      #
      # @return [String|NilClass]
      #
      def polarity
        @polarity ||= if sentiment = node.xpath('sentiment').first
          sentiment.get('polarity')
        end
      end

      ##
      # Returns the actual word ids that construct the lemma.
      #
      # @return [Array]
      #
      def target_ids
        @target_ids ||= node.xpath('span/target').map {|target| target.get('id')}
      end

      ##
      # Returns the strength of the term depending on its type: +1/-1 for
      # positive/negative polarity, 2 for an intensifier, -1 for a shifter
      # and 0 for anything else. Polarity takes precedence over modifiers.
      #
      # @return [Integer]
      #
      def strength
        if polarity == "positive"
          return 1
        elsif polarity == "negative"
          return -1
        end

        if is_intensifier?
          return 2
        elsif is_shifter?
          return -1
        end

        return 0
      end

      ##
      # Returns the sentence id that the term belongs to in the document.
      #
      # NOTE(review): this assumes a `wf` element always exists for the
      # first target id; a malformed KAF document would raise NoMethodError
      # on the `nil.get` call — confirm whether that should be guarded.
      #
      # @return [String]
      #
      def get_sentence(document)
        document
          .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
          .first
          .get('sent')
      end

      ##
      # Checks if a term is an intensifier.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_intensifier?
        sentiment_modifier == "intensifier"
      end

      ##
      # Checks if a term is a shifter.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_shifter?
        sentiment_modifier == "shifter"
      end

      ##
      # Checks if a term is an expression: it must still be in use and
      # carry an explicit polarity.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_expression?
        use && !!polarity
      end

      ##
      # Checks if a term is a conjunction for the given language.
      #
      # Unknown language codes simply yield `false` instead of raising
      # NoMethodError on a missing Hash entry.
      #
      # @return [TrueClass|FalseClass]
      #
      def is_conjunction?(language)
        conjunctions.fetch(language, []).include?(lemma)
      end

      ##
      # Map of conjunctions per language code
      #
      # @return [Hash]
      #
      def conjunctions
        {
          'nl' => [',','en'],
          'en' => [',','and'],
          'es' => [',','y','e'],
          'it' => [',','e','ed'],
          'de' => [',','und'],
          'fr' => [',','et']
        }
      end
    end # Term
  end # OpinionDetectorBasic
end # Opener
|
@@ -7,18 +7,14 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.summary = 'Basic Opinion Detector.'
|
8
8
|
gem.description = gem.summary
|
9
9
|
gem.homepage = 'http://opener-project.github.com/'
|
10
|
-
gem.extensions = ['ext/hack/Rakefile']
|
11
10
|
gem.license = 'Apache 2.0'
|
12
11
|
|
13
12
|
gem.required_ruby_version = '>= 1.9.2'
|
14
|
-
|
13
|
+
|
15
14
|
gem.files = Dir.glob([
|
16
|
-
'core/*',
|
17
|
-
'ext/**/*',
|
18
15
|
'lib/**/*',
|
19
16
|
'config.ru',
|
20
17
|
'*.gemspec',
|
21
|
-
'*_requirements.txt',
|
22
18
|
'README.md',
|
23
19
|
'LICENSE.txt',
|
24
20
|
'exec/**/*',
|
@@ -30,12 +26,11 @@ Gem::Specification.new do |gem|
|
|
30
26
|
gem.add_dependency 'opener-daemons', '~> 2.2'
|
31
27
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
32
28
|
gem.add_dependency 'opener-core', '~> 2.2'
|
33
|
-
|
34
|
-
gem.add_dependency '
|
35
|
-
gem.add_dependency 'nokogiri'
|
36
|
-
gem.add_dependency 'cliver'
|
37
|
-
gem.add_dependency 'slop', '~> 3.5'
|
29
|
+
|
30
|
+
gem.add_dependency 'oga'
|
38
31
|
|
39
32
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
40
33
|
gem.add_development_dependency 'cucumber'
|
34
|
+
gem.add_development_dependency 'rake'
|
35
|
+
gem.add_development_dependency 'benchmark-ips', '~> 2.0'
|
41
36
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-opinion-detector-basic
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
prerelease: false
|
54
54
|
type: :runtime
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: oga
|
57
57
|
version_requirements: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - '>='
|
@@ -67,21 +67,21 @@ dependencies:
|
|
67
67
|
prerelease: false
|
68
68
|
type: :runtime
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rspec
|
71
71
|
version_requirements: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - ~>
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
75
|
+
version: '3.0'
|
76
76
|
requirement: !ruby/object:Gem::Requirement
|
77
77
|
requirements:
|
78
|
-
- -
|
78
|
+
- - ~>
|
79
79
|
- !ruby/object:Gem::Version
|
80
|
-
version: '0'
|
80
|
+
version: '3.0'
|
81
81
|
prerelease: false
|
82
|
-
type: :
|
82
|
+
type: :development
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: cucumber
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - '>='
|
@@ -93,47 +93,33 @@ dependencies:
|
|
93
93
|
- !ruby/object:Gem::Version
|
94
94
|
version: '0'
|
95
95
|
prerelease: false
|
96
|
-
type: :
|
96
|
+
type: :development
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rake
|
99
99
|
version_requirements: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '>='
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '0'
|
104
104
|
requirement: !ruby/object:Gem::Requirement
|
105
105
|
requirements:
|
106
|
-
- -
|
106
|
+
- - '>='
|
107
107
|
- !ruby/object:Gem::Version
|
108
|
-
version: '
|
108
|
+
version: '0'
|
109
109
|
prerelease: false
|
110
|
-
type: :
|
110
|
+
type: :development
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
112
|
+
name: benchmark-ips
|
113
113
|
version_requirements: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - ~>
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
117
|
+
version: '2.0'
|
118
118
|
requirement: !ruby/object:Gem::Requirement
|
119
119
|
requirements:
|
120
120
|
- - ~>
|
121
121
|
- !ruby/object:Gem::Version
|
122
|
-
version: '
|
123
|
-
prerelease: false
|
124
|
-
type: :development
|
125
|
-
- !ruby/object:Gem::Dependency
|
126
|
-
name: cucumber
|
127
|
-
version_requirements: !ruby/object:Gem::Requirement
|
128
|
-
requirements:
|
129
|
-
- - '>='
|
130
|
-
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
132
|
-
requirement: !ruby/object:Gem::Requirement
|
133
|
-
requirements:
|
134
|
-
- - '>='
|
135
|
-
- !ruby/object:Gem::Version
|
136
|
-
version: '0'
|
122
|
+
version: '2.0'
|
137
123
|
prerelease: false
|
138
124
|
type: :development
|
139
125
|
description: Basic Opinion Detector.
|
@@ -142,21 +128,20 @@ executables:
|
|
142
128
|
- opinion-detector-basic
|
143
129
|
- opinion-detector-basic-daemon
|
144
130
|
- opinion-detector-basic-server
|
145
|
-
extensions:
|
146
|
-
- ext/hack/Rakefile
|
131
|
+
extensions: []
|
147
132
|
extra_rdoc_files: []
|
148
133
|
files:
|
149
|
-
- core/opinion_detector_basic_multi.py
|
150
|
-
- ext/hack/Rakefile
|
151
134
|
- lib/opener/opinion_detector_basic.rb
|
152
135
|
- lib/opener/opinion_detector_basic/cli.rb
|
136
|
+
- lib/opener/opinion_detector_basic/opinion.rb
|
137
|
+
- lib/opener/opinion_detector_basic/processor.rb
|
153
138
|
- lib/opener/opinion_detector_basic/server.rb
|
139
|
+
- lib/opener/opinion_detector_basic/term.rb
|
154
140
|
- lib/opener/opinion_detector_basic/version.rb
|
155
141
|
- lib/opener/opinion_detector_basic/public/markdown.css
|
156
142
|
- lib/opener/opinion_detector_basic/views/index.erb
|
157
143
|
- config.ru
|
158
144
|
- opener-opinion-detector-basic.gemspec
|
159
|
-
- pre_install_requirements.txt
|
160
145
|
- README.md
|
161
146
|
- LICENSE.txt
|
162
147
|
- exec/opinion-detector-basic.rb
|
@@ -1,512 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
|
3
|
-
import sys
|
4
|
-
import getopt
|
5
|
-
import os
|
6
|
-
|
7
|
-
this_folder = os.path.dirname(os.path.realpath(__file__))
|
8
|
-
|
9
|
-
# This updates the load path to ensure that the local site-packages directory
|
10
|
-
# can be used to load packages (e.g. a locally installed copy of lxml).
|
11
|
-
sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
|
12
|
-
|
13
|
-
from VUKafParserPy import KafParser
|
14
|
-
from collections import defaultdict
|
15
|
-
import operator
|
16
|
-
import pprint
|
17
|
-
import lxml
|
18
|
-
from lxml import etree
|
19
|
-
import logging
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
def mix_lists(l1,l2):
|
24
|
-
newl=[]
|
25
|
-
min_l = min(len(l1),len(l2))
|
26
|
-
for x in range(min_l):
|
27
|
-
newl.append(l1[x])
|
28
|
-
newl.append(l2[x])
|
29
|
-
|
30
|
-
if len(l1)>len(l2):
|
31
|
-
newl.extend(l1[min_l:])
|
32
|
-
elif len(l2)>len(l1):
|
33
|
-
newl.extend(l2[min_l:])
|
34
|
-
return newl
|
35
|
-
|
36
|
-
|
37
|
-
class OpinionExpression:
|
38
|
-
def __init__(self,spans,sentence,value):
|
39
|
-
self.ids = spans
|
40
|
-
self.sentence = sentence
|
41
|
-
self.value = value
|
42
|
-
self.target_ids = []
|
43
|
-
self.candidates_r=[]
|
44
|
-
self.candidates_l=[]
|
45
|
-
self.holder = []
|
46
|
-
|
47
|
-
def __repr__(self):
|
48
|
-
r='Ids:'+'#'.join(self.ids)+' Sent:'+self.sentence+' Value:'+str(self.value)+ ' Target:'+'#'.join(self.target_ids)+'\n'
|
49
|
-
r+='Right cand: '+str(self.candidates_r)+'\n'
|
50
|
-
r+='Left cand: '+str(self.candidates_l)+'\n'
|
51
|
-
return r
|
52
|
-
|
53
|
-
class MyToken:
|
54
|
-
def __init__(self,id,lemma,pos,polarity,sent_mod,sent):
|
55
|
-
self.id = id
|
56
|
-
self.lemma = lemma
|
57
|
-
self.pos = pos
|
58
|
-
self.polarity = polarity
|
59
|
-
self.sent_mod = sent_mod
|
60
|
-
self.sentence = sent
|
61
|
-
self.use_it = True
|
62
|
-
self.list_ids = [id]
|
63
|
-
self.value = 0
|
64
|
-
|
65
|
-
|
66
|
-
if polarity == 'positive':
|
67
|
-
self.value = 1
|
68
|
-
elif polarity == 'negative':
|
69
|
-
self.value = -1
|
70
|
-
|
71
|
-
if sent_mod == 'intensifier':
|
72
|
-
self.value = 2
|
73
|
-
elif sent_mod == 'shifter':
|
74
|
-
self.value = -1
|
75
|
-
|
76
|
-
|
77
|
-
def isNegator(self):
|
78
|
-
return self.sent_mod == 'shifter'
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
def isIntensifier(self):
|
83
|
-
return self.sent_mod == 'intensifier'
|
84
|
-
|
85
|
-
|
86
|
-
def is_opinion_expression(self):
|
87
|
-
return self.use_it and self.polarity is not None
|
88
|
-
|
89
|
-
|
90
|
-
def __repr__(self):
|
91
|
-
if self.use_it:
|
92
|
-
return self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
|
93
|
-
else:
|
94
|
-
return '\t'+self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
def obtain_opinion_expressions(tokens,lang='nl'):
|
99
|
-
logging.debug(' Obtaining opinion expressions')
|
100
|
-
my_tokens = tokens[:]
|
101
|
-
|
102
|
-
accumulate_several_modifiers = True
|
103
|
-
apply_modifiers = True
|
104
|
-
apply_conjunctions = True
|
105
|
-
|
106
|
-
## Acumulate doble/triple intensifiers or negators
|
107
|
-
if accumulate_several_modifiers:
|
108
|
-
logging.debug(' Accumulating modifiers')
|
109
|
-
t = 0
|
110
|
-
while t < len(my_tokens):
|
111
|
-
if t+1 < len(my_tokens):
|
112
|
-
if (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()) and my_tokens[t+1].isNegator():
|
113
|
-
my_tokens[t+1].value *= my_tokens[t].value
|
114
|
-
my_tokens[t].use_it = False
|
115
|
-
my_tokens[t+1].list_ids += my_tokens[t].list_ids
|
116
|
-
logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
|
117
|
-
elif my_tokens[t].isNegator() and my_tokens[t+1].isIntensifier():
|
118
|
-
my_tokens[t+1].value *= -1
|
119
|
-
my_tokens[t].use_it = False
|
120
|
-
my_tokens[t+1].list_ids += my_tokens[t].list_ids
|
121
|
-
logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
|
122
|
-
elif my_tokens[t].isIntensifier() and my_tokens[t+1].isIntensifier():
|
123
|
-
if my_tokens[t].value >= 0:
|
124
|
-
my_tokens[t+1].value = my_tokens[t].value + my_tokens[t+1].value
|
125
|
-
else:
|
126
|
-
my_tokens[t+1].value = my_tokens[t].value - my_tokens[t+1].value
|
127
|
-
my_tokens[t].use_it = False
|
128
|
-
my_tokens[t+1].list_ids += my_tokens[t].list_ids
|
129
|
-
logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
|
130
|
-
|
131
|
-
t+=1
|
132
|
-
###########################################
|
133
|
-
|
134
|
-
##Apply intensifiers/negators over the next elements
|
135
|
-
if apply_modifiers:
|
136
|
-
logging.debug(' Applying modifiers')
|
137
|
-
t = 0
|
138
|
-
while t < len(my_tokens):
|
139
|
-
if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
|
140
|
-
## Try to modify the next token:
|
141
|
-
if t+1<len(my_tokens):
|
142
|
-
#print 'Score: ',my_tokens[t]
|
143
|
-
my_tokens[t+1].value *= my_tokens[t].value
|
144
|
-
my_tokens[t+1].list_ids += my_tokens[t].list_ids
|
145
|
-
my_tokens[t].use_it = False
|
146
|
-
logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
|
147
|
-
t += 1
|
148
|
-
###########################################
|
149
|
-
|
150
|
-
if apply_conjunctions:
|
151
|
-
if lang=='nl':
|
152
|
-
concat = [',','en']
|
153
|
-
elif lang=='en':
|
154
|
-
concat = [',','and']
|
155
|
-
elif lang=='es':
|
156
|
-
concat = [',','y','e']
|
157
|
-
elif lang=='it':
|
158
|
-
concat = [',','e','ed']
|
159
|
-
elif lang=='de':
|
160
|
-
concat = [',','und']
|
161
|
-
elif lang == 'fr':
|
162
|
-
concat=[',','et']
|
163
|
-
logging.debug(' Applying conjunctions:'+str(concat))
|
164
|
-
|
165
|
-
|
166
|
-
t = 0
|
167
|
-
while t < len(my_tokens):
|
168
|
-
if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
|
169
|
-
#print 'FOUND ',my_tokens[t]
|
170
|
-
logging.debug(' Found token '+str(my_tokens[t]))
|
171
|
-
list_aux = my_tokens[t].list_ids
|
172
|
-
used = [t]
|
173
|
-
value_aux = my_tokens[t].value
|
174
|
-
my_tokens[t].use_it = False
|
175
|
-
#print 'Modified',my_tokens[t]
|
176
|
-
|
177
|
-
x = t+1
|
178
|
-
while True:
|
179
|
-
if x>=len(my_tokens):
|
180
|
-
break
|
181
|
-
|
182
|
-
if my_tokens[x].lemma in concat:
|
183
|
-
## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
|
184
|
-
my_tokens[x].use_it = False
|
185
|
-
x+=1
|
186
|
-
elif (my_tokens[x].use_it and my_tokens[x].value!=0):
|
187
|
-
#print '\Also ',my_tokens[x]
|
188
|
-
logging.debug(' Found token '+str(my_tokens[x]))
|
189
|
-
list_aux += my_tokens[x].list_ids
|
190
|
-
|
191
|
-
used.append(x)
|
192
|
-
my_tokens[x].use_it = False
|
193
|
-
value_aux += my_tokens[x].value
|
194
|
-
x += 1
|
195
|
-
else:
|
196
|
-
break
|
197
|
-
#print 'OUT OF THE WHILE'
|
198
|
-
##The last one in the list used is the one accumulating all
|
199
|
-
|
200
|
-
last_pos = used[-1]
|
201
|
-
my_tokens[last_pos].value = value_aux
|
202
|
-
my_tokens[last_pos].list_ids = list_aux
|
203
|
-
my_tokens[last_pos].use_it = True
|
204
|
-
logging.debug(' Regenerating '+str(my_tokens[last_pos]))
|
205
|
-
t = x ## next token
|
206
|
-
#print
|
207
|
-
#print
|
208
|
-
t += 1
|
209
|
-
|
210
|
-
|
211
|
-
## Create OpinionExpression
|
212
|
-
my_opinion_exps = []
|
213
|
-
logging.debug(' Generating output')
|
214
|
-
for token in my_tokens:
|
215
|
-
if token.use_it and token.value != 0:
|
216
|
-
op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
|
217
|
-
my_opinion_exps.append(op_exp)
|
218
|
-
return my_opinion_exps
|
219
|
-
|
220
|
-
|
221
|
-
'''
|
222
|
-
def get_distance(id1, id2):
|
223
|
-
pos1 = int(id1[id1.find('_')+1:])
|
224
|
-
pos2 = int(id2[id2.find('_')+1:])
|
225
|
-
if pos1>pos2:
|
226
|
-
return pos1-pos2
|
227
|
-
else:
|
228
|
-
return pos2-pos1
|
229
|
-
'''
|
230
|
-
|
231
|
-
|
232
|
-
def obtain_holders(ops_exps,sentences,lang):
|
233
|
-
if lang=='nl':
|
234
|
-
holders = ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun']
|
235
|
-
elif lang=='en':
|
236
|
-
holders = ['i','we','he','she','they','it','you']
|
237
|
-
elif lang =='es':
|
238
|
-
holders = ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras']
|
239
|
-
elif lang =='it':
|
240
|
-
holders = ['io','tu','noi','voi','loro','lei','lui']
|
241
|
-
elif lang == 'de':
|
242
|
-
holders = ['ich','du','wir','ihr','sie','er']
|
243
|
-
elif lang == 'fr':
|
244
|
-
holders = ['je','tu','lui','elle','nous','vous','ils','elles']
|
245
|
-
|
246
|
-
logging.debug('Obtaining holders with list: '+str(holders))
|
247
|
-
|
248
|
-
for oe in ops_exps:
|
249
|
-
sent = oe.sentence
|
250
|
-
list_terms = sentences[str(sent)]
|
251
|
-
for lemma, pos, term_id in list_terms:
|
252
|
-
if lemma in holders:
|
253
|
-
oe.holder.append(term_id)
|
254
|
-
logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
|
255
|
-
break
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
#This is specific for the basic version
|
261
|
-
def filter_candidates(candidates,ids_oe):
|
262
|
-
##filtered = [(lemma, pos,term_id) for (lemma,pos, term_id) in candidates if len(lemma)>=4 and term_id not in ids_oe]
|
263
|
-
filtered = [(lemma,pos,id) for (lemma,pos,id) in candidates if pos in ['N','R']]
|
264
|
-
return filtered
|
265
|
-
|
266
|
-
def obtain_targets_improved(ops_exps,sentences):
|
267
|
-
logging.debug(' Obtaining targets improved')
|
268
|
-
#print>>sys.stderr,'#'*40
|
269
|
-
#print>>sys.stderr,'#'*40
|
270
|
-
|
271
|
-
#print>>sys.stderr,'Beginning with obtain targets'
|
272
|
-
##sentences --> dict [str(numsent)] ==> list of (lemma, term)id
|
273
|
-
|
274
|
-
all_ids_in_oe = []
|
275
|
-
for oe in ops_exps:
|
276
|
-
all_ids_in_oe.extend(oe.ids)
|
277
|
-
#print>>sys.stderr,'All list of ids in oe',all_ids_in_oe
|
278
|
-
|
279
|
-
for oe in ops_exps:
|
280
|
-
#print>>sys.stderr,'\tOE:',oe
|
281
|
-
logging.debug(' OpExp: '+str(oe))
|
282
|
-
|
283
|
-
ids_in_oe = oe.ids
|
284
|
-
sent = oe.sentence
|
285
|
-
list_terms = sentences[str(sent)]
|
286
|
-
#print>>sys.stderr,'\t\tTerms in sent:',list_terms
|
287
|
-
|
288
|
-
###########################################
|
289
|
-
#First rule: noun to the right within maxdistance tokens
|
290
|
-
max_distance_right = 3
|
291
|
-
biggest_index = -1
|
292
|
-
for idx, (lemma,pos,term_id) in enumerate(list_terms):
|
293
|
-
if term_id in ids_in_oe:
|
294
|
-
biggest_index = idx
|
295
|
-
|
296
|
-
#print>>sys.stderr,'\t\tBI',biggest_index
|
297
|
-
if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
|
298
|
-
#print>>sys.stderr,'\t\tNot possible to apply 1st rule'
|
299
|
-
pass
|
300
|
-
else:
|
301
|
-
candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
|
302
|
-
##Filter candidates
|
303
|
-
#print>>sys.stderr,'\t\tCandidates for right rule no filter',candidates
|
304
|
-
#oe.__candidates_right = [(lemma, term_id) for (lemma, term_id) in candidates if len(lemma)>=4 and term_id not in all_ids_in_oe]
|
305
|
-
oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
|
306
|
-
logging.debug(' Candidates filtered right'+str(oe.candidates_r))
|
307
|
-
#print>>sys.stderr,'\t\tCandidates for right rule no filter',oe.__candidates_right
|
308
|
-
|
309
|
-
######################################################################################
|
310
|
-
|
311
|
-
|
312
|
-
###########################################
|
313
|
-
max_distance_left = 3
|
314
|
-
smallest_index = 0
|
315
|
-
for idx,(lemma,pos,term_id) in enumerate(list_terms):
|
316
|
-
if term_id in ids_in_oe:
|
317
|
-
smallest_index = idx
|
318
|
-
break
|
319
|
-
#print>>sys.stderr,'Smalles index:',smallest_index
|
320
|
-
if smallest_index == 0:
|
321
|
-
#print>>sys.stderr,'\t\tNot possible to apply left rule'
|
322
|
-
pass
|
323
|
-
else:
|
324
|
-
candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
|
325
|
-
##Filter candidates
|
326
|
-
#print>>sys.stderr,'\t\tCandidates for left rule no filter',candidates
|
327
|
-
|
328
|
-
oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
|
329
|
-
logging.debug(' Candidates filtered left: '+str(oe.candidates_l))
|
330
|
-
|
331
|
-
######################################################################################
|
332
|
-
|
333
|
-
#print>>sys.stderr,'#'*40
|
334
|
-
#print>>sys.stderr,'#'*40
|
335
|
-
|
336
|
-
## filling or.target_ids
|
337
|
-
assigned_as_targets = []
|
338
|
-
|
339
|
-
# First we assing to all the first in the right, if any, and not assigned
|
340
|
-
logging.debug(' Applying first to the right rule')
|
341
|
-
for oe in ops_exps:
|
342
|
-
#print>>sys.stderr,'A ver ',oe
|
343
|
-
if len(oe.candidates_r) !=0:
|
344
|
-
lemma, pos, id = oe.candidates_r[0]
|
345
|
-
if id not in assigned_as_targets:
|
346
|
-
oe.target_ids.append(id)
|
347
|
-
###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target moe than once
|
348
|
-
logging.debug(' OpExp '+str(oe)+' selected '+id)
|
349
|
-
#print>>sys.stderr,'Asignamos',id
|
350
|
-
|
351
|
-
logging.debug(' Applying most close rule')
|
352
|
-
for oe in ops_exps:
|
353
|
-
if len(oe.target_ids) == 0: # otherwise it's solved
|
354
|
-
intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
|
355
|
-
for id in intercalados_list:
|
356
|
-
if id not in assigned_as_targets:
|
357
|
-
oe.target_ids.append(id)
|
358
|
-
###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target moe than once
|
359
|
-
logging.debug(' OpExp '+str(oe)+' selected '+id)
|
360
|
-
break
|
361
|
-
|
362
|
-
######## MAIN ROUTINE ############
|
363
|
-
|
364
|
-
## Check if we are reading from a pipeline
|
365
|
-
if sys.stdin.isatty():
|
366
|
-
print>>sys.stderr,'Input stream required.'
|
367
|
-
print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
|
368
|
-
sys.exit(-1)
|
369
|
-
########################################
|
370
|
-
|
371
|
-
logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)
|
372
|
-
|
373
|
-
## Processing the parameters
|
374
|
-
my_time_stamp = True
|
375
|
-
remove_opinions = True
|
376
|
-
opinion_strength = True
|
377
|
-
try:
|
378
|
-
opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
|
379
|
-
for opt, arg in opts:
|
380
|
-
if opt == "--no-time":
|
381
|
-
my_time_stamp = False
|
382
|
-
elif opt == "--no-remove-opinions":
|
383
|
-
remove_opinions = False
|
384
|
-
elif opt == "--no-opinion-strength":
|
385
|
-
opinion_strength = False
|
386
|
-
except getopt.GetoptError:
|
387
|
-
pass
|
388
|
-
#########################################
|
389
|
-
|
390
|
-
logging.debug('Include timestamp: '+str(my_time_stamp))
|
391
|
-
|
392
|
-
# Parsing the KAF file
|
393
|
-
try:
|
394
|
-
my_kaf_tree = KafParser(sys.stdin)
|
395
|
-
except Exception as e:
|
396
|
-
print>>sys.stderr,'Error parsing input'
|
397
|
-
print>>sys.stderr,'Stream input must be a valid KAF file'
|
398
|
-
print>>sys.stderr,'Error: ',str(e)
|
399
|
-
sys.exit(-1)
|
400
|
-
|
401
|
-
|
402
|
-
lang = my_kaf_tree.getLanguage()
|
403
|
-
## Creating data structure
|
404
|
-
sentences = defaultdict(list)
|
405
|
-
my_tokens = []
|
406
|
-
|
407
|
-
|
408
|
-
# CREATE the datastructure for the tokens
|
409
|
-
n=0
|
410
|
-
lemma_for_tid = {}
|
411
|
-
for term in my_kaf_tree.getTerms():
|
412
|
-
n+=1
|
413
|
-
term_id = term.getId()
|
414
|
-
lemma = term.getLemma()
|
415
|
-
lemma_for_tid[term_id] = lemma
|
416
|
-
kaf_pos = term.getPos()
|
417
|
-
#print>>sys.stderr,kaf_pos
|
418
|
-
list_span = term.get_list_span() ## List of token ids in the span layer of the term
|
419
|
-
sentiment = term.getSentiment()
|
420
|
-
polarity = sent_mod = None
|
421
|
-
if sentiment is not None:
|
422
|
-
polarity = sentiment.getPolarity()
|
423
|
-
sent_mod = sentiment.getSentimentModifier()
|
424
|
-
sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
|
425
|
-
my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))
|
426
|
-
|
427
|
-
sentences[str(sentence)].append((lemma,kaf_pos,term_id))
|
428
|
-
#############################
|
429
|
-
|
430
|
-
logging.debug('Num terms loaded: '+str(n))
|
431
|
-
logging.debug('Num sentences: '+str(len(sentences)))
|
432
|
-
|
433
|
-
|
434
|
-
logging.debug('Obtaining opinion expressions')
|
435
|
-
my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
|
436
|
-
print>>sys.stderr,my_ops_exps
|
437
|
-
|
438
|
-
logging.debug('Obtaining targets')
|
439
|
-
obtain_targets_improved(my_ops_exps,sentences)
|
440
|
-
|
441
|
-
|
442
|
-
logging.debug('Obtaining holders')
|
443
|
-
obtain_holders(my_ops_exps,sentences,lang)
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
## Create the elements
|
449
|
-
logging.debug('Generating KAF output')
|
450
|
-
|
451
|
-
if remove_opinions:
|
452
|
-
my_kaf_tree.remove_opinion_layer()
|
453
|
-
|
454
|
-
for oe in my_ops_exps:
|
455
|
-
op_ele = etree.Element('opinion')
|
456
|
-
|
457
|
-
## Holder
|
458
|
-
if len(oe.holder)!=0:
|
459
|
-
oe.holder.sort()
|
460
|
-
c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
|
461
|
-
op_hol = etree.Element('opinion_holder')
|
462
|
-
op_hol.append(etree.Comment(c))
|
463
|
-
op_ele.append(op_hol)
|
464
|
-
span_op_hol = etree.Element('span')
|
465
|
-
op_hol.append(span_op_hol)
|
466
|
-
for id in oe.holder:
|
467
|
-
span_op_hol.append(etree.Element('target',attrib={'id':id}))
|
468
|
-
|
469
|
-
## Target
|
470
|
-
op_tar = etree.Element('opinion_target')
|
471
|
-
op_ele.append(op_tar)
|
472
|
-
|
473
|
-
|
474
|
-
if len(oe.target_ids)!=0: ## if there are no targets, there is no opinion eleemnt
|
475
|
-
oe.target_ids.sort()
|
476
|
-
c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
|
477
|
-
op_tar.append(etree.Comment(c))
|
478
|
-
span_op_tar = etree.Element('span')
|
479
|
-
op_tar.append(span_op_tar)
|
480
|
-
for id in oe.target_ids:
|
481
|
-
span_op_tar.append(etree.Element('target',attrib={'id':id}))
|
482
|
-
|
483
|
-
#Expression
|
484
|
-
if oe.value > 0: pol = 'positive'
|
485
|
-
elif oe.value < 0: pol = 'negative'
|
486
|
-
else: pol = 'neutral'
|
487
|
-
|
488
|
-
op_exp = etree.Element('opinion_expression')
|
489
|
-
op_exp.set('polarity',pol)
|
490
|
-
if opinion_strength:
|
491
|
-
op_exp.set('strength',str(oe.value))
|
492
|
-
|
493
|
-
op_ele.append(op_exp)
|
494
|
-
oe.ids.sort()
|
495
|
-
c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
|
496
|
-
op_exp.append(etree.Comment(c))
|
497
|
-
span_exp = etree.Element('span')
|
498
|
-
op_exp.append(span_exp)
|
499
|
-
for id in oe.ids:
|
500
|
-
span_exp.append(etree.Element('target',attrib={'id':id}))
|
501
|
-
|
502
|
-
##Append the op_ele to the opinions layer
|
503
|
-
my_kaf_tree.addElementToLayer('opinions', op_ele)
|
504
|
-
|
505
|
-
|
506
|
-
my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
|
507
|
-
my_kaf_tree.saveToFile(sys.stdout)
|
508
|
-
logging.debug('Process finished')
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|