opener-opinion-detector-basic 1.0.0
- checksums.yaml +7 -0
- data/README.md +30 -0
- data/bin/opinion-detector-basic +19 -0
- data/bin/opinion-detector-basic-server +10 -0
- data/config.ru +4 -0
- data/core/opinion_detector_basic_multi.py +499 -0
- data/core/packages/KafNafParser-1.3.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detector_basic.rb +91 -0
- data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
- data/lib/opener/opinion_detector_basic/server.rb +16 -0
- data/lib/opener/opinion_detector_basic/version.rb +5 -0
- data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
- data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
- data/opener-opinion-detector-basic.gemspec +36 -0
- data/pre_build_requirements.txt +1 -0
- metadata +309 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: cc6dd6c71396853ddb39ff898599ad4c3f466401
  data.tar.gz: c820abad54167482947c7a56d78cba5331482957
SHA512:
  metadata.gz: ee6555c6dee3540ed4b7a7817e106cfb459281f159c60d73b2af548b9bc19b967ce70bc52e8bac991eabdab6b74a6d30606f7672c339d75c999bf2666a8a66ec
  data.tar.gz: 72fc4d27ce5b98fab608880492a2c7b135e24542eb9b2cb26b3bd4c32008095f966ec944588d30aaeb278c265bdb65bd22688dd43649df7cd16d9ea0e99ece55
data/README.md ADDED
@@ -0,0 +1,30 @@
Opinion Detector Basic
======================

This module implements an opinion detector for English (it also works for Dutch and
German). The language is determined by the "xml:lang" attribute in the input KAF
file. Depending on the value of this attribute, the corresponding lexicon will
be loaded. This module detects three elements of the opinions:

* Expression: the actual opinion expression
* Target: what the previous expression is about
* Holder: who is stating that expression

Requirements
------------

* VUKafParserPy: parser in python for KAF files
* lxml: library for processing xml in python

Usage
-----

The input KAF file has to be annotated with at least the term layer, with
polarity information. Correct input files for this module are the output KAF
files from the polarity tagger module.

To tag an input KAF file example.kaf with opinions you can run:

    $ cat example.with.polarities.kaf | core/opinion_detector_basic_multi.py > output.with.opinions.kaf

The output will be the input KAF file extended with the opinion layer.
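For illustration only, the sketch below shows roughly what the opinion layer appended by this module looks like. The element names (opinion, opinion_holder, opinion_target, opinion_expression with polarity and strength attributes, span and target elements, and lemma comments) are taken from data/core/opinion_detector_basic_multi.py further down; the wrapping <opinions> element name, the term ids and the lemmas are assumptions, not output copied from the module.

    <!-- illustrative sketch; layer element name, ids and lemmas are hypothetical -->
    <opinions>
      <opinion>
        <opinion_holder>
          <!--i-->
          <span>
            <target id="t_1"/>
          </span>
        </opinion_holder>
        <opinion_target>
          <!--service-->
          <span>
            <target id="t_5"/>
          </span>
        </opinion_target>
        <opinion_expression polarity="positive" strength="1">
          <!--good-->
          <span>
            <target id="t_4"/>
          </span>
        </opinion_expression>
      </opinion>
    </opinions>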
data/bin/opinion-detector-basic ADDED
@@ -0,0 +1,19 @@
#!/usr/bin/env ruby

require_relative '../lib/opener/opinion_detector_basic'

# STDIN.tty? returns `false` if data is being piped into the current process.
if STDIN.tty?
  input = nil
else
  input = STDIN.read
end

kernel = Opener::OpinionDetectorBasic.new(:args => ARGV)
stdout, stderr, process = kernel.run(input)

if process.success?
  puts stdout
else
  abort stderr
end
data/bin/opinion-detector-basic-server ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby

require 'rack'

# Without calling `Rack::Server#options` manually the CLI arguments will never
# be passed, thus the application can't be specified as a constructor argument.
server = Rack::Server.new
server.options[:config] = File.expand_path('../../config.ru', __FILE__)

server.start
data/core/opinion_detector_basic_multi.py ADDED
@@ -0,0 +1,499 @@
#!/usr/bin/env python

import sys
import getopt
import os

this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))

from VUKafParserPy import KafParser
from collections import defaultdict
import operator
import pprint
import lxml
from lxml import etree
import logging


def mix_lists(l1,l2):
    newl=[]
    min_l = min(len(l1),len(l2))
    for x in range(min_l):
        newl.append(l1[x])
        newl.append(l2[x])

    if len(l1)>len(l2):
        newl.extend(l1[min_l:])
    elif len(l2)>len(l1):
        newl.extend(l2[min_l:])
    return newl


class OpinionExpression:
    def __init__(self,spans,sentence,value):
        self.ids = spans
        self.sentence = sentence
        self.value = value
        self.target_ids = []
        self.candidates_r=[]
        self.candidates_l=[]
        self.holder = []

    def __repr__(self):
        r='Ids:'+'#'.join(self.ids)+' Sent:'+self.sentence+' Value:'+str(self.value)+' Target:'+'#'.join(self.target_ids)+'\n'
        r+='Right cand: '+str(self.candidates_r)+'\n'
        r+='Left cand: '+str(self.candidates_l)+'\n'
        return r


class MyToken:
    def __init__(self,id,lemma,pos,polarity,sent_mod,sent):
        self.id = id
        self.lemma = lemma
        self.pos = pos
        self.polarity = polarity
        self.sent_mod = sent_mod
        self.sentence = sent
        self.use_it = True
        self.list_ids = [id]
        self.value = 0

        if polarity == 'positive':
            self.value = 1
        elif polarity == 'negative':
            self.value = -1

        if sent_mod == 'intensifier':
            self.value = 2
        elif sent_mod == 'shifter':
            self.value = -1

    def isNegator(self):
        return self.sent_mod == 'shifter'

    def isIntensifier(self):
        return self.sent_mod == 'intensifier'

    def is_opinion_expression(self):
        return self.use_it and self.polarity is not None

    def __repr__(self):
        if self.use_it:
            return self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
        else:
            return '\t'+self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)


def obtain_opinion_expressions(tokens,lang='nl'):
    logging.debug(' Obtaining opinion expressions')
    my_tokens = tokens[:]

    accumulate_several_modifiers = True
    apply_modifiers = True
    apply_conjunctions = True

    ## Acumulate doble/triple intensifiers or negators
    if accumulate_several_modifiers:
        logging.debug(' Accumulating modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].isNegator() or my_tokens[t].isIntensifier():
                if t+1 < len(my_tokens) and ( my_tokens[t+1].isNegator() or my_tokens[t+1].isIntensifier()):
                    ## There are 2 negators/intensifiers next to each other
                    ## The first one is deactivated and the second one is modified
                    my_tokens[t].use_it = False
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accucumating '+'-'.join(my_tokens[t+1].list_ids))
            t+=1
    ###########################################

    ## Apply intensifiers/negators over the next elements
    if apply_modifiers:
        logging.debug(' Applying modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
                ## Try to modify the next token:
                if t+1<len(my_tokens):
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    my_tokens[t].use_it = False
                    logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
            t += 1
    ###########################################

    if apply_conjunctions:
        if lang=='nl':
            concat = [',','en']
        elif lang=='en':
            concat = [',','and']
        elif lang=='es':
            concat = [',','y','e']
        elif lang=='it':
            concat = [',','e','ed']
        elif lang=='de':
            concat = [',','und']
        elif lang == 'fr':
            concat=[',','et']
        logging.debug(' Applying conjunctions:'+str(concat))

        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
                #print 'FOUND ',my_tokens[t]
                logging.debug(' Found token '+str(my_tokens[t]))
                list_aux = my_tokens[t].list_ids
                used = [t]
                value_aux = my_tokens[t].value
                my_tokens[t].use_it = False
                #print 'Modified',my_tokens[t]

                x = t+1
                while True:
                    if x>=len(my_tokens):
                        break

                    if my_tokens[x].lemma in concat:
                        ## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
                        my_tokens[x].use_it = False
                        x+=1
                    elif (my_tokens[x].use_it and my_tokens[x].value!=0):
                        #print '\Also ',my_tokens[x]
                        logging.debug(' Found token '+str(my_tokens[x]))
                        list_aux += my_tokens[x].list_ids

                        used.append(x)
                        my_tokens[x].use_it = False
                        value_aux += my_tokens[x].value
                        x += 1
                    else:
                        break
                #print 'OUT OF THE WHILE'
                ## The last one in the list used is the one accumulating all
                last_pos = used[-1]
                my_tokens[last_pos].value = value_aux
                my_tokens[last_pos].list_ids = list_aux
                my_tokens[last_pos].use_it = True
                logging.debug(' Regenerating '+str(my_tokens[last_pos]))
                t = x ## next token
            t += 1

    ## Create OpinionExpression
    my_opinion_exps = []
    logging.debug(' Generating output')
    for token in my_tokens:
        if token.use_it and token.value != 0:
            op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
            my_opinion_exps.append(op_exp)
    return my_opinion_exps


'''
def get_distance(id1, id2):
    pos1 = int(id1[id1.find('_')+1:])
    pos2 = int(id2[id2.find('_')+1:])
    if pos1>pos2:
        return pos1-pos2
    else:
        return pos2-pos1
'''


def obtain_holders(ops_exps,sentences,lang):
    if lang=='nl':
        holders = ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun']
    elif lang=='en':
        holders = ['i','we','he','she','they','it','you']
    elif lang =='es':
        holders = ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras']
    elif lang =='it':
        holders = ['io','tu','noi','voi','loro','lei','lui']
    elif lang == 'de':
        holders = ['ich','du','wir','ihr','sie','er']
    elif lang == 'fr':
        holders = ['je','tu','lui','elle','nous','vous','ils','elles']

    logging.debug('Obtaining holders with list: '+str(holders))

    for oe in ops_exps:
        sent = oe.sentence
        list_terms = sentences[str(sent)]
        for lemma, pos, term_id in list_terms:
            if lemma in holders:
                oe.holder.append(term_id)
                logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
                break


# This is specific for the basic version
def filter_candidates(candidates,ids_oe):
    ##filtered = [(lemma, pos,term_id) for (lemma,pos, term_id) in candidates if len(lemma)>=4 and term_id not in ids_oe]
    filtered = [(lemma,pos,id) for (lemma,pos,id) in candidates if pos in ['N','R']]
    return filtered


def obtain_targets_improved(ops_exps,sentences):
    logging.debug(' Obtaining targets improved')
    #print>>sys.stderr,'#'*40
    #print>>sys.stderr,'#'*40

    #print>>sys.stderr,'Beginning with obtain targets'
    ## sentences --> dict [str(numsent)] ==> list of (lemma, term_id)

    all_ids_in_oe = []
    for oe in ops_exps:
        all_ids_in_oe.extend(oe.ids)
    #print>>sys.stderr,'All list of ids in oe',all_ids_in_oe

    for oe in ops_exps:
        #print>>sys.stderr,'\tOE:',oe
        logging.debug(' OpExp: '+str(oe))

        ids_in_oe = oe.ids
        sent = oe.sentence
        list_terms = sentences[str(sent)]
        #print>>sys.stderr,'\t\tTerms in sent:',list_terms

        ###########################################
        # First rule: noun to the right within maxdistance tokens
        max_distance_right = 3
        biggest_index = -1
        for idx, (lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                biggest_index = idx

        #print>>sys.stderr,'\t\tBI',biggest_index
        if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
            #print>>sys.stderr,'\t\tNot possible to apply 1st rule'
            pass
        else:
            candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
            ## Filter candidates
            #print>>sys.stderr,'\t\tCandidates for right rule no filter',candidates
            #oe.__candidates_right = [(lemma, term_id) for (lemma, term_id) in candidates if len(lemma)>=4 and term_id not in all_ids_in_oe]
            oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered right'+str(oe.candidates_r))
            #print>>sys.stderr,'\t\tCandidates for right rule no filter',oe.__candidates_right
        ######################################################################################

        ###########################################
        max_distance_left = 3
        smallest_index = 0
        for idx,(lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                smallest_index = idx
                break
        #print>>sys.stderr,'Smalles index:',smallest_index
        if smallest_index == 0:
            #print>>sys.stderr,'\t\tNot possible to apply left rule'
            pass
        else:
            candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
            ## Filter candidates
            #print>>sys.stderr,'\t\tCandidates for left rule no filter',candidates
            oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered left: '+str(oe.candidates_l))
        ######################################################################################

    #print>>sys.stderr,'#'*40
    #print>>sys.stderr,'#'*40

    ## Filling oe.target_ids
    assigned_as_targets = []

    # First we assign to all the first candidate on the right, if any, and not assigned
    logging.debug(' Applying first to the right rule')
    for oe in ops_exps:
        #print>>sys.stderr,'A ver ',oe
        if len(oe.candidates_r) !=0:
            lemma, pos, id = oe.candidates_r[0]
            if id not in assigned_as_targets:
                oe.target_ids.append(id)
                ###assigned_as_targets.append(id) # Uncomment to avoid selection of the same target more than once
                logging.debug(' OpExp '+str(oe)+' selected '+id)
                #print>>sys.stderr,'Asignamos',id

    logging.debug(' Applying most close rule')
    for oe in ops_exps:
        if len(oe.target_ids) == 0: # otherwise it's solved
            intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
            for id in intercalados_list:
                if id not in assigned_as_targets:
                    oe.target_ids.append(id)
                    ###assigned_as_targets.append(id) # Uncomment to avoid selection of the same target more than once
                    logging.debug(' OpExp '+str(oe)+' selected '+id)
                    break


######## MAIN ROUTINE ############

## Check if we are reading from a pipeline
if sys.stdin.isatty():
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    sys.exit(-1)
########################################

logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)

## Processing the parameters
my_time_stamp = True
remove_opinions = True
opinion_strength = True
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
        elif opt == "--no-remove-opinions":
            remove_opinions = False
        elif opt == "--no-opinion-strength":
            opinion_strength = False
except getopt.GetoptError:
    pass
#########################################

logging.debug('Include timestamp: '+str(my_time_stamp))

# Parsing the KAF file
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    print>>sys.stderr,'Error parsing input'
    print>>sys.stderr,'Stream input must be a valid KAF file'
    print>>sys.stderr,'Error: ',str(e)
    sys.exit(-1)


lang = my_kaf_tree.getLanguage()
## Creating data structure
sentences = defaultdict(list)
my_tokens = []

# CREATE the datastructure for the tokens
n=0
lemma_for_tid = {}
for term in my_kaf_tree.getTerms():
    n+=1
    term_id = term.getId()
    lemma = term.getLemma()
    lemma_for_tid[term_id] = lemma
    kaf_pos = term.getPos()
    #print>>sys.stderr,kaf_pos
    list_span = term.get_list_span() ## List of token ids in the span layer of the term
    sentiment = term.getSentiment()
    polarity = sent_mod = None
    if sentiment is not None:
        polarity = sentiment.getPolarity()
        sent_mod = sentiment.getSentimentModifier()
    sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
    my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))

    sentences[str(sentence)].append((lemma,kaf_pos,term_id))
#############################

logging.debug('Num terms loaded: '+str(n))
logging.debug('Num sentences: '+str(len(sentences)))

logging.debug('Obtaining opinion expressions')
my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
print>>sys.stderr,my_ops_exps

logging.debug('Obtaining targets')
obtain_targets_improved(my_ops_exps,sentences)

logging.debug('Obtaining holders')
obtain_holders(my_ops_exps,sentences,lang)


## Create the elements
logging.debug('Generating KAF output')

if remove_opinions:
    my_kaf_tree.remove_opinion_layer()

for oe in my_ops_exps:
    op_ele = etree.Element('opinion')

    ## Holder
    if len(oe.holder)!=0:
        oe.holder.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
        op_hol = etree.Element('opinion_holder')
        op_hol.append(etree.Comment(c))
        op_ele.append(op_hol)
        span_op_hol = etree.Element('span')
        op_hol.append(span_op_hol)
        for id in oe.holder:
            span_op_hol.append(etree.Element('target',attrib={'id':id}))

    ## Target
    op_tar = etree.Element('opinion_target')
    op_ele.append(op_tar)

    if len(oe.target_ids)!=0: ## if there are no targets, there is no opinion element
        oe.target_ids.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
        op_tar.append(etree.Comment(c))
        span_op_tar = etree.Element('span')
        op_tar.append(span_op_tar)
        for id in oe.target_ids:
            span_op_tar.append(etree.Element('target',attrib={'id':id}))

    # Expression
    if oe.value > 0: pol = 'positive'
    elif oe.value < 0: pol = 'negative'
    else: pol = 'neutral'

    op_exp = etree.Element('opinion_expression')
    op_exp.set('polarity',pol)
    if opinion_strength:
        op_exp.set('strength',str(oe.value))

    op_ele.append(op_exp)
    oe.ids.sort()
    c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
    op_exp.append(etree.Comment(c))
    span_exp = etree.Element('span')
    op_exp.append(span_exp)
    for id in oe.ids:
        span_exp.append(etree.Element('target',attrib={'id':id}))

    ## Append the op_ele to the opinions layer
    my_kaf_tree.addElementToLayer('opinions', op_ele)


my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
logging.debug('Process finished')
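The main routine above accepts three optional long flags parsed with getopt: --no-time (omit the timestamp from the linguistic processor header), --no-remove-opinions (keep any existing opinion layer), and --no-opinion-strength (omit the strength attribute). An illustrative invocation in the same piped style as the README, with the file names as placeholders:

    # hypothetical file names; the flags are those defined in the getopt call above
    $ cat example.with.polarities.kaf | core/opinion_detector_basic_multi.py --no-time --no-opinion-strength > output.with.opinions.kaf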