opener-opinion-detector-basic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +30 -0
- data/bin/opinion-detector-basic +19 -0
- data/bin/opinion-detector-basic-server +10 -0
- data/config.ru +4 -0
- data/core/opinion_detector_basic_multi.py +499 -0
- data/core/packages/KafNafParser-1.3.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detector_basic.rb +91 -0
- data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
- data/lib/opener/opinion_detector_basic/server.rb +16 -0
- data/lib/opener/opinion_detector_basic/version.rb +5 -0
- data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
- data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
- data/opener-opinion-detector-basic.gemspec +36 -0
- data/pre_build_requirements.txt +1 -0
- metadata +309 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: cc6dd6c71396853ddb39ff898599ad4c3f466401
|
|
4
|
+
data.tar.gz: c820abad54167482947c7a56d78cba5331482957
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: ee6555c6dee3540ed4b7a7817e106cfb459281f159c60d73b2af548b9bc19b967ce70bc52e8bac991eabdab6b74a6d30606f7672c339d75c999bf2666a8a66ec
|
|
7
|
+
data.tar.gz: 72fc4d27ce5b98fab608880492a2c7b135e24542eb9b2cb26b3bd4c32008095f966ec944588d30aaeb278c265bdb65bd22688dd43649df7cd16d9ea0e99ece55
|
data/README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Opinion Detector Basic
|
|
2
|
+
======================
|
|
3
|
+
|
|
4
|
+
This module implements an opinion detector for English (also works for Dutch and
|
|
5
|
+
German). The language is determined by the "xml:lang" attribute in the input KAF
|
|
6
|
+
file. Depending on the value of this attribute, the corresponding lexicon will
|
|
7
|
+
be loaded. This module detects three elements of the opinions:
|
|
8
|
+
|
|
9
|
+
* Expression: the actual opinion expression
|
|
10
|
+
* Target: about what is the previous expression
|
|
11
|
+
* Holder: who is stating that expression
|
|
12
|
+
|
|
13
|
+
Requirements
|
|
14
|
+
-----------
|
|
15
|
+
* VUKafParserPy: parser in python for KAF files
|
|
16
|
+
* lxml: library for processing xml in python
|
|
17
|
+
|
|
18
|
+
Usage
|
|
19
|
+
----
|
|
20
|
+
|
|
21
|
+
The input KAF file has to be annotated with at least the term layer, with
|
|
22
|
+
polarity information. Correct input files for this module are the output KAF
|
|
23
|
+
files from the polarity tagger module
|
|
24
|
+
|
|
25
|
+
To tag an input KAF file example.kaf with opinions you can run:
|
|
26
|
+
|
|
27
|
+
$ cat example.with.polarities.kaf | core/opinion_detector_basic_multi.py > output.with.opinions.kaf
|
|
28
|
+
|
|
29
|
+
The output will be the input KAF file extended with the opinion layer.
|
|
30
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#!/usr/bin/env ruby

# CLI entry point: reads a KAF document from STDIN (when piped), runs the
# basic opinion detector kernel on it and writes the result to STDOUT.

require_relative '../lib/opener/opinion_detector_basic'

# STDIN.tty? returns `false` if data is being piped into the current process.
if STDIN.tty?
  input = nil
else
  input = STDIN.read
end

# Forward any command-line arguments to the kernel unchanged.
kernel = Opener::OpinionDetectorBasic.new(:args => ARGV)
stdout, stderr, process = kernel.run(input)

# Mirror the child process outcome: emit its STDOUT on success, otherwise
# abort with its STDERR (abort exits with a non-zero status).
if process.success?
  puts stdout
else
  abort stderr
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#!/usr/bin/env ruby

# Boots the Rack web server for the opinion detector, using the gem's
# bundled config.ru (resolved relative to this script's location).

require 'rack'

# Without calling `Rack::Server#options` manually the CLI arguments will never
# be passed, thus the application can't be specified as a constructor argument.
server = Rack::Server.new
server.options[:config] = File.expand_path('../../config.ru', __FILE__)

server.start
|
data/config.ru
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import getopt
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
this_folder = os.path.dirname(os.path.realpath(__file__))
|
|
8
|
+
|
|
9
|
+
# This updates the load path to ensure that the local site-packages directory
|
|
10
|
+
# can be used to load packages (e.g. a locally installed copy of lxml).
|
|
11
|
+
sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
|
|
12
|
+
|
|
13
|
+
from VUKafParserPy import KafParser
|
|
14
|
+
from collections import defaultdict
|
|
15
|
+
import operator
|
|
16
|
+
import pprint
|
|
17
|
+
import lxml
|
|
18
|
+
from lxml import etree
|
|
19
|
+
import logging
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def mix_lists(l1, l2):
    """Interleave two lists element by element.

    Elements are taken alternately from ``l1`` and ``l2``; once the shorter
    list is exhausted, the remainder of the longer list is appended as-is.
    """
    interleaved = []
    for a, b in zip(l1, l2):
        interleaved.append(a)
        interleaved.append(b)

    # Append the unmatched tail of whichever list is longer.
    shorter = min(len(l1), len(l2))
    if len(l1) > len(l2):
        interleaved.extend(l1[shorter:])
    elif len(l2) > len(l1):
        interleaved.extend(l2[shorter:])
    return interleaved
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class OpinionExpression:
    """A detected opinion expression: the span of term ids forming the
    expression, the sentence it occurs in and its polarity value.

    Target and holder fields start empty and are filled in later by the
    target/holder detection passes.
    """

    def __init__(self, spans, sentence, value):
        self.ids = spans            # term ids making up the expression
        self.sentence = sentence
        self.value = value          # accumulated polarity strength
        self.target_ids = []        # filled by target detection
        self.candidates_r = []      # target candidates to the right
        self.candidates_l = []      # target candidates to the left
        self.holder = []            # filled by holder detection

    def __repr__(self):
        lines = [
            'Ids:' + '#'.join(self.ids) + ' Sent:' + self.sentence
            + ' Value:' + str(self.value) + ' Target:' + '#'.join(self.target_ids),
            'Right cand: ' + str(self.candidates_r),
            'Left cand: ' + str(self.candidates_l),
        ]
        return '\n'.join(lines) + '\n'
|
|
52
|
+
|
|
53
|
+
class MyToken:
    """One KAF term together with its sentiment information.

    The numeric ``value`` encodes the token's contribution to an opinion
    expression: +1/-1 for positive/negative polarity, overridden to 2 for
    intensifiers and -1 for shifters (negators).
    """

    def __init__(self, id, lemma, pos, polarity, sent_mod, sent):
        self.id = id
        self.lemma = lemma
        self.pos = pos
        self.polarity = polarity
        self.sent_mod = sent_mod
        self.sentence = sent
        self.use_it = True          # cleared when the token is merged into a neighbour
        self.list_ids = [id]        # term ids accumulated onto this token
        self.value = 0

        # Base score from lexicon polarity.
        if polarity == 'positive':
            self.value = 1
        elif polarity == 'negative':
            self.value = -1

        # Modifier tokens override the base score entirely.
        if sent_mod == 'intensifier':
            self.value = 2
        elif sent_mod == 'shifter':
            self.value = -1

    def isNegator(self):
        return self.sent_mod == 'shifter'

    def isIntensifier(self):
        return self.sent_mod == 'intensifier'

    def is_opinion_expression(self):
        return self.use_it and self.polarity is not None

    def __repr__(self):
        # Both branches render the same fields; deactivated tokens are
        # merely indented with a leading tab.
        body = (self.id + ' lemma:' + self.lemma.encode('utf-8') + '.'
                + self.pos.encode('utf-8') + ' pol:' + str(self.polarity)
                + ' sentmod:' + str(self.sent_mod) + ' sent:' + self.sentence
                + ' use:' + str(self.use_it) + ' list:' + '#'.join(self.list_ids)
                + ' val:' + str(self.value))
        if self.use_it:
            return body
        return '\t' + body
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def obtain_opinion_expressions(tokens, lang='nl'):
    """Group sentiment-bearing tokens into OpinionExpression objects.

    Works on a shallow copy of *tokens* (note: the MyToken objects
    themselves are still mutated in place). Three sequential passes:

    1. Accumulate chains of adjacent negators/intensifiers into one
       modifier token.
    2. Apply each surviving modifier to the token immediately after it.
    3. Merge polar tokens joined by language-specific conjunctions into a
       single expression.

    Every token left active with a non-zero value becomes one
    OpinionExpression.
    """
    logging.debug(' Obtaining opinion expressions')
    my_tokens = tokens[:]

    accumulate_several_modifiers = True
    apply_modifiers = True
    apply_conjunctions = True

    ## Accumulate double/triple intensifiers or negators
    if accumulate_several_modifiers:
        logging.debug(' Accumulating modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].isNegator() or my_tokens[t].isIntensifier():
                if t+1 < len(my_tokens) and ( my_tokens[t+1].isNegator() or my_tokens[t+1].isIntensifier()):
                    ## There are 2 negators/intensifiers next to each other.
                    ## The first one is deactivated and folded into the second:
                    ## values multiply (e.g. shifter*shifter cancels out).
                    my_tokens[t].use_it = False
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accucumating '+'-'.join(my_tokens[t+1].list_ids))
            t+=1
    ###########################################

    ## Apply intensifiers/negators over the next elements
    if apply_modifiers:
        logging.debug(' Applying modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
                ## Try to modify the next token (if there is one):
                if t+1<len(my_tokens):
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    my_tokens[t].use_it = False
                    logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
            t += 1
    ###########################################

    if apply_conjunctions:
        # Language-specific conjunction lemmas used to chain expressions.
        # NOTE(review): an unsupported language leaves `concat` unbound and
        # the logging call below raises NameError — same family of issue as
        # obtain_holders; confirm the supported-language set upstream.
        if lang=='nl':
            concat = [',','en']
        elif lang=='en':
            concat = [',','and']
        elif lang=='es':
            concat = [',','y','e']
        elif lang=='it':
            concat = [',','e','ed']
        elif lang=='de':
            concat = [',','und']
        elif lang == 'fr':
            concat=[',','et']
        logging.debug(' Applying conjunctions:'+str(concat))

        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
                logging.debug(' Found token '+str(my_tokens[t]))
                list_aux = my_tokens[t].list_ids
                used = [t]
                value_aux = my_tokens[t].value
                my_tokens[t].use_it = False

                # Scan forward over conjunctions and further polar tokens,
                # folding them all into one accumulated expression.
                x = t+1
                while True:
                    if x>=len(my_tokens):
                        break

                    if my_tokens[x].lemma in concat:
                        ## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
                        my_tokens[x].use_it = False
                        x+=1
                    elif (my_tokens[x].use_it and my_tokens[x].value!=0):
                        logging.debug(' Found token '+str(my_tokens[x]))
                        list_aux += my_tokens[x].list_ids
                        used.append(x)
                        my_tokens[x].use_it = False
                        value_aux += my_tokens[x].value
                        x += 1
                    else:
                        break
                ## The last one in the `used` list is the one accumulating all
                last_pos = used[-1]
                my_tokens[last_pos].value = value_aux
                my_tokens[last_pos].list_ids = list_aux
                my_tokens[last_pos].use_it = True
                logging.debug(' Regenerating '+str(my_tokens[last_pos]))
                t = x ## jump past the chain we just consumed
            t += 1

    ## Create one OpinionExpression per surviving polar token
    my_opinion_exps = []
    logging.debug(' Generating output')
    for token in my_tokens:
        if token.use_it and token.value != 0:
            op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
            my_opinion_exps.append(op_exp)
    return my_opinion_exps
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
'''
|
|
209
|
+
def get_distance(id1, id2):
|
|
210
|
+
pos1 = int(id1[id1.find('_')+1:])
|
|
211
|
+
pos2 = int(id2[id2.find('_')+1:])
|
|
212
|
+
if pos1>pos2:
|
|
213
|
+
return pos1-pos2
|
|
214
|
+
else:
|
|
215
|
+
return pos2-pos1
|
|
216
|
+
'''
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def obtain_holders(ops_exps, sentences, lang):
    """Attach an opinion holder to each opinion expression, in place.

    For every expression, scans the terms of its sentence in order and
    records the first term whose lemma is a personal pronoun for *lang*
    (appended to ``oe.holder``).

    :param ops_exps: list of OpinionExpression objects (mutated in place)
    :param sentences: dict mapping str(sentence number) -> list of
        (lemma, pos, term_id) tuples
    :param lang: two-letter language code
    """
    pronouns_for_lang = {
        'nl': ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun'],
        'en': ['i','we','he','she','they','it','you'],
        'es': ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras'],
        'it': ['io','tu','noi','voi','loro','lei','lui'],
        'de': ['ich','du','wir','ihr','sie','er'],
        'fr': ['je','tu','lui','elle','nous','vous','ils','elles'],
    }
    # Bug fix: the original if/elif ladder left `holders` unbound for any
    # unsupported language, raising NameError below. Unknown languages now
    # simply yield no holders.
    holders = pronouns_for_lang.get(lang, [])

    logging.debug('Obtaining holders with list: '+str(holders))

    for oe in ops_exps:
        sent = oe.sentence
        list_terms = sentences[str(sent)]
        for lemma, pos, term_id in list_terms:
            if lemma in holders:
                # Only the first matching pronoun per expression is kept.
                oe.holder.append(term_id)
                logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
                break
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
#This is specific for the basic version
|
|
248
|
+
#This is specific for the basic version
def filter_candidates(candidates, ids_oe):
    """Keep only candidate terms whose POS tag is 'N' or 'R'.

    *ids_oe* (ids already used by opinion expressions) is currently unused;
    it is kept for interface compatibility with alternative filters.
    """
    allowed_pos = ('N', 'R')
    kept = []
    for lemma, pos, term_id in candidates:
        if pos in allowed_pos:
            kept.append((lemma, pos, term_id))
    return kept
|
|
252
|
+
|
|
253
|
+
def obtain_targets_improved(ops_exps, sentences):
    """Assign target term ids to each opinion expression, in place.

    Two heuristic rules, applied per expression:

    1. "Right rule": collect N/R terms among the 3 terms directly to the
       right of the expression's last term; the first such candidate
       becomes the target.
    2. "Closest rule" (fallback): if no right candidate was chosen,
       interleave right and left candidates by proximity and take the
       first available one.

    :param ops_exps: list of OpinionExpression objects (mutated in place:
        candidates_r, candidates_l and target_ids are filled)
    :param sentences: dict mapping str(sentence number) -> list of
        (lemma, pos, term_id) tuples
    """
    logging.debug(' Obtaining targets improved')

    ## sentences --> dict [str(numsent)] ==> list of (lemma, pos, term_id)

    # All term ids already used by any opinion expression.
    all_ids_in_oe = []
    for oe in ops_exps:
        all_ids_in_oe.extend(oe.ids)

    for oe in ops_exps:
        logging.debug(' OpExp: '+str(oe))

        ids_in_oe = oe.ids
        sent = oe.sentence
        list_terms = sentences[str(sent)]

        ###########################################
        # First rule: noun to the right within max_distance_right tokens.
        max_distance_right = 3
        # Index of the LAST term of the expression within the sentence
        # (loop keeps overwriting, so the final match wins).
        biggest_index = -1
        for idx, (lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                biggest_index = idx

        if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
            pass
        else:
            candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
            ## Filter candidates by POS
            oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered right'+str(oe.candidates_r))

        ######################################################################################

        ###########################################
        # Symmetric scan to the left of the expression's FIRST term.
        max_distance_left = 3
        smallest_index = 0
        for idx,(lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                smallest_index = idx
                break
        if smallest_index == 0:
            # Expression starts the sentence: nothing to its left.
            pass
        else:
            candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
            ## Filter candidates by POS
            oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered left: '+str(oe.candidates_l))

        ######################################################################################

    ## Filling oe.target_ids
    # NOTE(review): nothing is ever appended to assigned_as_targets, so the
    # "not in" checks below always pass — the same target can be chosen by
    # several expressions (the commented-out appends are the switch for that).
    assigned_as_targets = []

    # First we assign to each expression the first candidate on the right, if any
    logging.debug(' Applying first to the right rule')
    for oe in ops_exps:
        if len(oe.candidates_r) !=0:
            lemma, pos, id = oe.candidates_r[0]
            if id not in assigned_as_targets:
                oe.target_ids.append(id)
                ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                logging.debug(' OpExp '+str(oe)+' selected '+id)

    # Fallback: pick the closest remaining candidate, alternating right/left.
    logging.debug(' Applying most close rule')
    for oe in ops_exps:
        if len(oe.target_ids) == 0: # otherwise it's solved
            intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
            for id in intercalados_list:
                if id not in assigned_as_targets:
                    oe.target_ids.append(id)
                    ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                    logging.debug(' OpExp '+str(oe)+' selected '+id)
                    break
|
|
348
|
+
|
|
349
|
+
######## MAIN ROUTINE ############
# Python 2 script: reads a KAF document on stdin, detects opinion
# expressions/targets/holders and writes the extended KAF to stdout.

## Check if we are reading from a pipeline; refuse to run interactively.
if sys.stdin.isatty():
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    sys.exit(-1)
########################################

logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)

## Processing the command-line flags (all default to enabled).
my_time_stamp = True
remove_opinions = True
opinion_strength = True
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
        elif opt == "--no-remove-opinions":
            remove_opinions = False
        elif opt == "--no-opinion-strength":
            opinion_strength = False
except getopt.GetoptError:
    # Unknown options are silently ignored; defaults stay in effect.
    pass
#########################################

logging.debug('Include timestamp: '+str(my_time_stamp))

# Parsing the KAF file from stdin.
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    print>>sys.stderr,'Error parsing input'
    print>>sys.stderr,'Stream input must be a valid KAF file'
    print>>sys.stderr,'Error: ',str(e)
    sys.exit(-1)


lang = my_kaf_tree.getLanguage()
## Creating data structures:
## sentences: str(sentence number) -> list of (lemma, pos, term_id)
sentences = defaultdict(list)
my_tokens = []


# Build one MyToken per KAF term, carrying its sentiment annotation.
n=0
lemma_for_tid = {}   # term id -> lemma, used for the comment nodes below
for term in my_kaf_tree.getTerms():
    n+=1
    term_id = term.getId()
    lemma = term.getLemma()
    lemma_for_tid[term_id] = lemma
    kaf_pos = term.getPos()
    list_span = term.get_list_span() ## List of token ids in the span layer of the term
    sentiment = term.getSentiment()
    polarity = sent_mod = None
    if sentiment is not None:
        polarity = sentiment.getPolarity()
        sent_mod = sentiment.getSentimentModifier()
    sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
    my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))

    sentences[str(sentence)].append((lemma,kaf_pos,term_id))
#############################

logging.debug('Num terms loaded: '+str(n))
logging.debug('Num sentences: '+str(len(sentences)))


logging.debug('Obtaining opinion expressions')
my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
print>>sys.stderr,my_ops_exps

logging.debug('Obtaining targets')
obtain_targets_improved(my_ops_exps,sentences)


logging.debug('Obtaining holders')
obtain_holders(my_ops_exps,sentences,lang)




## Emit one <opinion> element per detected expression.
logging.debug('Generating KAF output')

if remove_opinions:
    my_kaf_tree.remove_opinion_layer()

for oe in my_ops_exps:
    op_ele = etree.Element('opinion')

    ## Holder: optional; lemmas are embedded as an XML comment for readability.
    if len(oe.holder)!=0:
        oe.holder.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
        op_hol = etree.Element('opinion_holder')
        op_hol.append(etree.Comment(c))
        op_ele.append(op_hol)
        span_op_hol = etree.Element('span')
        op_hol.append(span_op_hol)
        for id in oe.holder:
            span_op_hol.append(etree.Element('target',attrib={'id':id}))

    ## Target: the element is always created, its span only when ids exist.
    op_tar = etree.Element('opinion_target')
    op_ele.append(op_tar)


    if len(oe.target_ids)!=0: ## if there are no targets, the element stays empty
        oe.target_ids.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
        op_tar.append(etree.Comment(c))
        span_op_tar = etree.Element('span')
        op_tar.append(span_op_tar)
        for id in oe.target_ids:
            span_op_tar.append(etree.Element('target',attrib={'id':id}))

    # Expression: polarity label derived from the accumulated value.
    if oe.value > 0: pol = 'positive'
    elif oe.value < 0: pol = 'negative'
    else: pol = 'neutral'

    op_exp = etree.Element('opinion_expression')
    op_exp.set('polarity',pol)
    if opinion_strength:
        op_exp.set('strength',str(oe.value))

    op_ele.append(op_exp)
    oe.ids.sort()
    c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
    op_exp.append(etree.Comment(c))
    span_exp = etree.Element('span')
    op_exp.append(span_exp)
    for id in oe.ids:
        span_exp.append(etree.Element('target',attrib={'id':id}))

    ## Append the op_ele to the opinions layer
    my_kaf_tree.addElementToLayer('opinions', op_ele)


my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
logging.debug('Process finished')
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
|