opener-constituent-parser-de 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +41 -0
- data/bin/constituent-parser-de +8 -0
- data/core/convert_penn_to_kaf.py +127 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +161 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +326 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUSentimentLexicon-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUSentimentLexicon-1.0-py2.7.egg-info/SOURCES.txt +6 -0
- data/core/site-packages/pre_build/VUSentimentLexicon-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUSentimentLexicon-1.0-py2.7.egg-info/installed-files.txt +19 -0
- data/core/site-packages/pre_build/VUSentimentLexicon-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/DE-lexicon/Sentiment-German-HotelDomain.xml +12847 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/DE-lexicon/germanLex.txt +8883 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml +28197 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/EN-lexicon/Sentiment-English-general.xml +73998 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/ES-lexicon/es-sentiment_lexicon.lmf +106035 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/FR-lexicon/fr-sentiment_lexicon-old.lmf +232008 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/FR-lexicon/fr-sentiment_lexicon.lmf +141651 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/IT-lexicon/it-sentiment_lexicon.lmf +200790 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/LexiconMod.py +137 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/NL-lexicon/Sentiment-Dutch-HotelDomain.xml +15007 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/NL-lexicon/Sentiment-Dutch-general.xml +83143 -0
- data/core/site-packages/pre_build/VUSentimentLexicon/__init__.py +5 -0
- data/core/stanford_parser_de.py +142 -0
- data/core/tree.py +1438 -0
- data/core/vendor/stanford-parser/stanford-parser-2.0.5-models.jar +0 -0
- data/core/vendor/stanford-parser/stanford-parser.jar +0 -0
- data/ext/hack/Rakefile +13 -0
- data/ext/hack/support.rb +50 -0
- data/lib/opener/constituent_parsers/de.rb +100 -0
- data/lib/opener/constituent_parsers/de/version.rb +7 -0
- data/opener-constituent-parser-de.gemspec +34 -0
- data/pre_build_requirements.txt +1 -0
- data/pre_install_requirements.txt +1 -0
- metadata +139 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d5a9ae0201da32865ead9a1fe9524712c148204a
|
4
|
+
data.tar.gz: 609f1d0465fab2a08278cebdc484949d0fe98e72
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b1b526f3a0b0787591003013e54065d172aa82e32fb4b333537fdf2e2b6724926fc6e58e45180a53df36e03340f0f249a160918621b62d22896f6260f3f501f6
|
7
|
+
data.tar.gz: 126d01c13291d08eadc89203f9a8e3471d46e35360b004c1153d7a17fe1808684be3f65a731e47e19ccc89ff825a3fbaae34ea643e608c0cdb09345de1016007
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
[![Build Status](https://drone.io/github.com/opener-project/constituent-parser-de/status.png)](https://drone.io/github.com/opener-project/constituent-parser-de/latest)
|
2
|
+
|
3
|
+
VU-parser-DE_kernel
|
4
|
+
===================
|
5
|
+
|
6
|
+
Introduction
|
7
|
+
------------
|
8
|
+
|
9
|
+
This is a parser for German text using the Stanford parser (http://nlp.stanford.edu/software/lex-parser.shtml). The input for this module has to be a valid
|
10
|
+
KAF file with at least the text layer. The output will be the constituent trees in pennTreebank format for each of the sentences in the input KAF.
|
11
|
+
The tokenization and sentence splitting is taken from the input KAF file, so if your input file has a wrong tokenization/splitting, the output could
|
12
|
+
contain errors. The number of output constituent trees will be exactly the same as the number of sentences in your input KAF
|
13
|
+
|
14
|
+
Requirements
|
15
|
+
-----------
|
16
|
+
* VUKafParserPy: parser in python for KAF files (https://github.com/opener-project/VU-kaf-parser)
|
17
|
+
* lxml: library for processing xml in python
|
18
|
+
* Stanford parser: http://nlp.stanford.edu/software/lex-parser.shtml
|
19
|
+
|
20
|
+
Installation
|
21
|
+
-----------
|
22
|
+
Clone the repository to your local machine and set the variable STANFORD_HOME in the file core/stanford_parser_de.py
|
23
|
+
to point to your local folder of the Stanford parser.
|
24
|
+
|
25
|
+
How to run the module with Python
|
26
|
+
---------------------------------
|
27
|
+
|
28
|
+
You can run this module from the command line using Python. The main script is core/stanford_parser_de.py. This script reads the KAF from the standard input
|
29
|
+
and writes the output to the standard output, generating some log information in the standard error output. To process one file just run:
|
30
|
+
````shell
|
31
|
+
cat input.kaf | core/stanford_parser_de.py > input.tree
|
32
|
+
````
|
33
|
+
|
34
|
+
This will read the KAF file in "input.kaf" and will store the constituent trees in "input.tree".
|
35
|
+
|
36
|
+
|
37
|
+
Contact
|
38
|
+
------
|
39
|
+
* Ruben Izquierdo
|
40
|
+
* Vrije University of Amsterdam
|
41
|
+
* ruben.izquierdobevia@vu.nl
|
@@ -0,0 +1,127 @@
|
|
1
|
+
from lxml import etree
|
2
|
+
from tree import Tree
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
##Module-level scratch state shared by convert_penn_to_kaf() and visit_node().
##It is reset at the start of every convert_penn_to_kaf() call, so this module
##is NOT safe for concurrent use.
list_t = []     # terminals: (terminal_id, 'termid1 termid2 ...') pairs
list_nt = []    # non-terminals: (nonterminal_id, label) pairs
list_edge = []  # edges: (edge_id, from_node, to_node) triples
cnt_t = cnt_nt = cnt_edge = 0

##This function generates a "tree" xml element as defined in KAF from a string containing
##the penntreebank format and a list of term ids to do the linking
'''
s = '(S (NP (DET The) (NN dog)) (VP (V ate) (NP (DET the) (NN cat))) (. .))'
ids = ['t0 t1','t2','t3','t4','t5','t6']
tree_node = create_constituency_layer(s, ids)
e = etree.ElementTree(element=tree_node)
e.write(sys.stdout,pretty_print=True)
'''
def convert_penn_to_kaf(tree_str,term_ids,logging,lemma_for_termid,off_t,off_nt,off_edge):
    """Convert one Penn-treebank tree string into a KAF <tree> element.

    tree_str         -- parenthesised Penn-treebank tree for one sentence
    term_ids         -- one entry per leaf; each entry is a space-separated
                        string of KAF term ids for that token
    logging          -- logger-like object used for debug traces
    lemma_for_termid -- dict term_id -> lemma (used for the XML comments only)
    off_t, off_nt, off_edge -- starting offsets for the ter/nter/tre counters
                        so that ids stay unique across sentences
    Returns (tree_element, cnt_t, cnt_nt, cnt_edge) with the updated counters.
    """
    global list_t, list_nt,list_edge,cnt_t,cnt_nt,cnt_edge
    list_t = []
    list_nt = []
    list_edge = []
    cnt_t = off_t
    cnt_nt = off_nt
    cnt_edge = off_edge

    this_tree = Tree(tree_str)
    logging.debug('\n'+str(this_tree))

    ## Replace every leaf by its KAF term id so the constituency layer can be
    ## linked back to the term layer.
    for num, token in enumerate(this_tree.leaves()):
        position = this_tree.leaf_treeposition(num)
        token_id = term_ids[num]
        this_tree[position] = token_id
        ## BUG FIX: lemma_for_termid.get(token_id) may be None, and
        ## None.encode('utf-8') raised AttributeError; fall back to ''.
        logging.debug('Matching '+token+' with term id='+token_id+' which according to KAF lemma='+str((lemma_for_termid.get(token_id) or u'').encode('utf-8')))

    ##Create the ROOT (disabled: the parser output already carries a root node)
    create_extra_root = False
    nt_id = None
    if create_extra_root:
        nt_id = 'nter'+str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id,'ROOT'))

    visit_node(this_tree, nt_id)

    root = etree.Element('tree')
    nonter_heads = set()
    ##Non-terminals: a trailing '=H' on a label marks the head constituent
    labels_for_nt = {}
    for nt_id, label in list_nt:
        ##Checking the head
        if len(label)>=2 and label[-1]=='H' and label[-2]=='=':
            nonter_heads.add(nt_id)
            label = label[:-2]
        ele = etree.Element('nt', attrib={'id':nt_id,'label':label})
        labels_for_nt[nt_id] = label
        root.append(ele)

    ## Terminals: each <t> spans one or more KAF term ids
    lemma_for_ter = {}
    for ter_id, span_ids in list_t:
        ele = etree.Element('t',attrib={'id':ter_id})
        span = etree.Element('span')
        ele.append(span)
        for termid in span_ids.split(' '):
            target = etree.Element('target',attrib={'id':termid})
            span.append(target)
            lemma_for_ter[ter_id] = lemma_for_termid.get(termid,'unknown')
        root.append(ele)

    ##Edges, each preceded by a human-readable XML comment
    for edge_id, node_from, node_to in list_edge:
        ele = etree.Element('edge',attrib={'id':edge_id,'from':node_from,'to':node_to})

        ## 'to' is always a non-terminal (edges point child -> parent)
        label_to = labels_for_nt.get(node_to)

        ## 'from' may be a terminal or a non-terminal
        label_from = labels_for_nt.get(node_from)
        if label_from is None:
            label_from = lemma_for_ter.get(node_from,'kk')

        comment = ' '+(edge_id)+' '+(label_to)+' <- '+(label_from)+' '

        if node_from in nonter_heads:
            ele.set('head','yes')
        root.append(etree.Comment(comment))
        root.append(ele)

    return root,cnt_t,cnt_nt,cnt_edge


def visit_node(node,id_parent=None):
    """Depth-first walk filling list_t / list_nt / list_edge for one tree.

    Leaves (already replaced by their term-id strings) become terminal
    entries; Tree nodes become non-terminal entries; a child->parent edge is
    recorded for every node that has a parent.
    """
    global list_t, list_nt,list_edge,cnt_t,cnt_nt,cnt_edge
    if isinstance(node,str): #is a terminal
        ##Create the terminal
        t_id = 'ter'+str(cnt_t)
        cnt_t += 1
        list_t.append((t_id,str(node)))

        ##Create the edge with the parent
        edge_id = 'tre'+str(cnt_edge)
        cnt_edge += 1
        list_edge.append((edge_id,t_id,id_parent))
    else: #Is a non terminal
        ##Create the nonterminal
        nt_id = 'nter'+str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id,node.node))

        ##Create the linking with the parent (the root itself has none)
        if id_parent is not None:
            edge_id = 'tre'+str(cnt_edge)
            cnt_edge += 1
            list_edge.append((edge_id,nt_id,id_parent))

        ##Recurse into the children
        for child in node:
            visit_node(child,nt_id)
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
../VUKafParserPy/KafParserMod.py
|
2
|
+
../VUKafParserPy/__init__.py
|
3
|
+
../VUKafParserPy/KafDataObjectsMod.py
|
4
|
+
../VUKafParserPy/KafParserMod.pyc
|
5
|
+
../VUKafParserPy/__init__.pyc
|
6
|
+
../VUKafParserPy/KafDataObjectsMod.pyc
|
7
|
+
./
|
8
|
+
top_level.txt
|
9
|
+
SOURCES.txt
|
10
|
+
PKG-INFO
|
11
|
+
dependency_links.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
VUKafParserPy
|
@@ -0,0 +1,161 @@
|
|
1
|
+
class KafTermSentiment:
    """Sentiment annotation attached to a KAF term.

    All fields default to None until simpleInit() is called.
    """

    def __init__(self):
        self.resource = None
        self.polarity = None
        self.strength = None
        self.subjectivity = None
        ## BUG FIX: the original never initialised sentiment_modifier, so
        ## getSentimentModifier() raised AttributeError on a fresh instance.
        self.sentiment_modifier = None

    def simpleInit(self, r, p, st, su, sm=None):
        """Populate all fields at once: resource, polarity, strength,
        subjectivity and (optionally) sentiment modifier."""
        self.resource = r
        self.polarity = p
        self.strength = st
        self.subjectivity = su
        self.sentiment_modifier = sm

    def getPolarity(self):
        """Return the polarity value (may be None)."""
        return self.polarity

    def getSentimentModifier(self):
        """Return the sentiment modifier (may be None)."""
        return self.sentiment_modifier
|
20
|
+
|
21
|
+
|
22
|
+
class KafToken:
    """One word form (token) of the KAF text layer."""

    def __init__(self, wid, value, sent=None, para=None):
        # Keep the token exactly as read from the <wf> element.
        self.token_id, self.value = wid, value
        self.sent, self.para = sent, para
|
28
|
+
|
29
|
+
|
30
|
+
class KafOpinionExpression:
    """Opinion expression: polarity, strength and the target term ids."""

    def __init__(self, polarity, strength, targets):
        self.polarity = polarity
        self.strength = strength
        self.targets = targets

    def __str__(self):
        # Same rendering as always: "Op_exp==> pol:<p> Str:<s> ids:<t1-t2-...>"
        pieces = ('Op_exp==> pol:' + self.polarity,
                  'Str:' + self.strength,
                  'ids:' + '-'.join(self.targets))
        return ' '.join(pieces)
|
38
|
+
|
39
|
+
class KafOpinion:
    """Complete opinion: id, holder span, target span and an expression."""

    def __init__(self, id, holders, targets, opi_exp):
        self.id = id
        self.holders = holders
        self.targets = targets
        self.opi_exp = opi_exp

    def __str__(self):
        # Multi-line dump: id line, holders line, targets line, then the
        # stringified opinion expression.
        header = ['Opinion id' + self.id,
                  ' Holders: ' + '-'.join(self.holders),
                  ' Targets: ' + '-'.join(self.targets)]
        return '\n'.join(header) + '\n' + str(self.opi_exp)
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
class KafSingleProperty:
    """Property from the KAF features layer: id, type and target term ids."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Property identifier."""
        return self.id

    def get_type(self):
        """Property type (the lemma attribute)."""
        return self.type

    def get_span(self):
        """List of target term ids covered by this property."""
        return self.targets

    def __str__(self):
        return 'Id: %s Type: %s ids:%s' % (self.id, self.type, ' '.join(self.targets))
|
73
|
+
|
74
|
+
|
75
|
+
class KafSingleEntity:
    """Named entity from the KAF entities layer: id, type and target ids."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Entity identifier."""
        return self.id

    def get_type(self):
        """Entity type (e.g. PERSON, LOCATION)."""
        return self.type

    def get_span(self):
        """List of target term ids covered by this entity."""
        return self.targets

    def __str__(self):
        return 'Id: %s Type: %s ids:%s' % (self.id, self.type, ' '.join(self.targets))
|
92
|
+
|
93
|
+
class KafTerm:
    """One term of the KAF term layer: id, lemma, pos, optional sentiment
    and the list of token ids it spans."""

    def __init__(self):
        self.tid = None            # term identifier (tid attribute)
        self.lemma = None
        self.pos = None
        self.sentiment = None      # KafTermSentiment or None
        self.list_span_id = []     # token ids covered by this term

    def set_list_span_id(self, L):
        """Set the list of token ids this term spans."""
        self.list_span_id = L

    def get_list_span(self):
        """Return the list of token ids this term spans."""
        return self.list_span_id

    def get_polarity(self):
        """Polarity from the attached sentiment, or None if there is none."""
        return self.sentiment.getPolarity() if self.sentiment is not None else None

    def get_sentiment_modifier(self):
        """Sentiment modifier from the attached sentiment, or None."""
        return self.sentiment.getSentimentModifier() if self.sentiment is not None else None

    def setSentiment(self, my_sent):
        self.sentiment = my_sent

    def getSentiment(self):
        return self.sentiment

    def getLemma(self):
        return self.lemma

    def setLemma(self, lemma):
        self.lemma = lemma

    def getPos(self):
        return self.pos

    def setPos(self, pos):
        self.pos = pos

    def getId(self):
        return self.tid

    def setId(self, id):
        self.tid = id

    def getShortPos(self):
        """First letter of the pos tag, lower-cased, with the KAF remapping
        'g' (adjective) -> 'a' and 'a' (adverb) -> 'r'. None when pos unset."""
        if self.pos is None:
            return None
        short = self.pos.lower()[0]
        return {'g': 'a', 'a': 'r'}.get(short, short)

    def __str__(self):
        # Only render the full form when all three core fields are present.
        if self.tid and self.lemma and self.pos:
            return self.tid+'\n\t'+self.lemma.encode('utf-8')+'\n\t'+self.pos
        return 'None'
|
157
|
+
|
158
|
+
|
159
|
+
|
160
|
+
|
161
|
+
|
@@ -0,0 +1,326 @@
|
|
1
|
+
########################################################################
|
2
|
+
# 14 Jan 2013: added function add_attrs_to_layer
|
3
|
+
########################################################################
|
4
|
+
|
5
|
+
###################
|
6
|
+
# List of changes #
|
7
|
+
###################
|
8
|
+
# 14 Jan 2013: added function add_attrs_to_layer
|
9
|
+
# 27 Feb 2013: added code for comply with DTD
|
10
|
+
# 18 Jun 2013: getSingleProperties adapted to the structure KAF/features/properties/property/references/span/target
|
11
|
+
# 18 Jun 2013: funcion add_property created for adding the properties to the KAF
|
12
|
+
from lxml import etree
|
13
|
+
from KafDataObjectsMod import *
|
14
|
+
import time
|
15
|
+
|
16
|
+
class KafParser:
    """Reader/writer for KAF documents backed by an lxml ElementTree.

    When *filename* is given the document is parsed from it; otherwise an
    empty <KAF version="v1.opener" xml:lang="en"> document is created.
    """

    def __init__(self,filename=None):
        self.tree=None
        self.__pathForToken={}   # token wid -> XPath of its <wf> element

        if filename:
            self.tree = etree.parse(filename,etree.XMLParser(remove_blank_text=True))
            ## Index the text layer for getToken() lookups
            self.__textTokenization()
        else:
            root = etree.Element('KAF')
            root.set('version','v1.opener')
            root.set('{http://www.w3.org/XML/1998/namespace}lang','en')
            self.tree = etree.ElementTree(element=root)

    def __textTokenization(self):
        """Store the XPath of every <wf> element of the text layer, keyed by wid."""
        for wf in self.tree.findall('text/wf'):
            wid = wf.get('wid')
            self.__pathForToken[wid] = self.tree.getpath(wf)

    def getToken(self,tid):
        """Return the <wf> element whose wid is *tid* (KeyError if unknown)."""
        return self.tree.xpath(self.__pathForToken[tid])[0]

    def getLanguage(self):
        """Return the xml:lang attribute of the root, defaulting to 'nl'."""
        lang = self.tree.getroot().get('{http://www.w3.org/XML/1998/namespace}lang','nl')
        return lang

    def getTokens(self):
        """Yield (word, sentence_id, word_id) for every token in the text layer."""
        for element in self.tree.findall('text/wf'):
            w_id = element.get('wid')
            s_id = element.get('sent','0')
            word = element.text
            yield (word, s_id, w_id)

    def getTerms(self):
        """Yield a populated KafTerm (id, lemma, pos, sentiment, span) per <term>."""
        if self.tree:
            for element in self.tree.findall('terms/term'):
                kafTermObj = KafTerm()
                kafTermObj.setId(element.get('tid'))
                kafTermObj.setLemma(element.get('lemma'))
                kafTermObj.setPos(element.get('pos'))

                ## Parsing sentiment
                sentiment = element.find('sentiment')
                if sentiment is not None:
                    resource = sentiment.get('resource','')
                    polarity = sentiment.get('polarity',None)
                    strength = sentiment.get('strength','')
                    subjectivity = sentiment.get('subjectivity','')
                    sentiment_modifier = sentiment.get('sentiment_modifier')

                    my_sent = KafTermSentiment()
                    my_sent.simpleInit(resource,polarity,strength,subjectivity,sentiment_modifier)
                    kafTermObj.setSentiment(my_sent)

                ## Parsing the span
                span = element.find('span')
                if span is not None:
                    list_ids = [target.get('id') for target in span.findall('target')]
                    kafTermObj.set_list_span_id(list_ids)

                yield kafTermObj
        else:
            return

    def getSentimentTriples(self):
        """Return a list of (lemma, polarity, sentiment_modifier) per term."""
        data = []
        if self.tree:
            for term_element in self.tree.findall('terms/term'):
                lemma = term_element.get('lemma')
                polarity = None
                sentiment_modifier = None

                sentiment_element = term_element.find('sentiment')
                if sentiment_element is not None:
                    polarity = sentiment_element.get('polarity',None)
                    sentiment_modifier = sentiment_element.get('sentiment_modifier')
                data.append( (lemma,polarity,sentiment_modifier))
        return data

    def addPolarityToTerm(self,termid,my_sentiment_attribs,polarity_pos=None):
        """Append a <sentiment> element (attributes taken from the given dict)
        to the term with id *termid*; also sets the term's pos attribute to
        *polarity_pos* when the term has none."""
        if self.tree:
            for element in self.tree.find('terms'):
                if element.get('tid','')==termid:
                    #In case there is no pos info, we use the polarityPos
                    if not element.get('pos') and polarity_pos is not None:
                        element.set('pos',polarity_pos)
                    sentEle = etree.Element('sentiment',attrib=my_sentiment_attribs)
                    element.append(sentEle)

    def saveToFile(self,filename,myencoding='UTF-8'):
        """Pretty-print the document to *filename* with an XML declaration."""
        if self.tree:
            self.tree.write(filename,encoding=myencoding,pretty_print=True,xml_declaration=True)

    def addLinguisticProcessor(self,name,version, layer, time_stamp=True):
        """Register an <lp name=... version=... timestamp=...> element under
        the <linguisticProcessors> element for *layer*, creating the kafHeader
        and the layer element on demand. With time_stamp=False the timestamp
        is the literal '*'."""
        aux = self.tree.findall('kafHeader')
        if len(aux)!=0:
            kaf_header = aux[0]
        else:
            kaf_header = etree.Element('kafHeader')
            self.tree.getroot().insert(0,kaf_header)

        ## Check if there is already an element for the layer
        my_lp_ele = None
        existing_lp_layers = kaf_header.findall('linguisticProcessors')
        for element in existing_lp_layers:
            if element.get('layer','')==layer:
                my_lp_ele = element
                break

        if time_stamp:
            my_time = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        else:
            my_time = '*'

        my_lp = etree.Element('lp')
        my_lp.set('timestamp',my_time)
        my_lp.set('version',version)
        my_lp.set('name',name)

        if my_lp_ele is not None: #Already an element for linguisticProcessor with the layer
            my_lp_ele.append(my_lp)
        else:
            ## BUG FIX: the original created a singular 'linguisticProcessor'
            ## element (which the plural lookup above could never find again,
            ## per the KAF DTD the container is <linguisticProcessors>), and it
            ## raised NameError on `element` when the header had no
            ## linguisticProcessors elements at all.
            my_lp_ele = etree.Element('linguisticProcessors')
            my_lp_ele.set('layer',layer)
            my_lp_ele.append(my_lp)
            if existing_lp_layers:
                ## Insert right after the last existing linguisticProcessors
                idx = kaf_header.index(existing_lp_layers[-1])
                kaf_header.insert(idx+1,my_lp_ele)
            else:
                kaf_header.append(my_lp_ele)

    def addLayer(self,type,element,first_char_id=None):
        """Append *element* to the layer named *type* (creating the layer if
        needed), give it a generated id like 'X1', 'X2', ... where X is
        *first_char_id* (default: first letter of the layer name), and return
        the new id."""
        if first_char_id is None:
            first_char_id = type[0]

        ## Check if there is already a layer for the type
        layer_element = self.tree.find(type)

        if layer_element is None:
            layer_element = etree.Element(type)
            self.tree.getroot().append(layer_element)
            ## The id is going to be the first one
            new_id = first_char_id+'1'
        else:
            ## The next id is derived from the number of existing children
            current_n = len(layer_element.getchildren())
            new_id = first_char_id+''+str(current_n+1)

        ## At this point layer_element points to the correct element, existing or created
        element.set(first_char_id+'id',new_id)
        layer_element.append(element)
        return new_id

    def addElementToLayer(self,layer, element,first_char_id=None):
        """Alias for addLayer(); kept for backwards compatibility."""
        return self.addLayer(layer,element,first_char_id)

    def add_attrs_to_layer(self,layer,attrs):
        """Set every attribute in the *attrs* dict on the layer element, if it exists."""
        layer_element = self.tree.find(layer)
        if layer_element is not None:
            for att, val in attrs.items():
                layer_element.set(att,val)

    def addAttributeToElement(self,path,str_id, id, attribute, value,sub_path=None):
        """Set *attribute*=*value* on the first element under *path* whose
        *str_id* attribute equals *id* (optionally descending into *sub_path*
        first), then stop."""
        for element in self.tree.findall(path):
            if id is not None and element.get(str_id,None) == id:
                if sub_path is not None:
                    elements = element.findall(sub_path)
                    if len(elements)!=0: element = elements[0]
                element.set(attribute,value)
                return

    ## This works with the original definition of the property layer
    ## KAF -> properties -> property* -> span* -> target*
    def getSingleProperties_old(self):
        """Yield a KafSingleProperty per span of the legacy properties layer."""
        for element in self.tree.findall('properties/property'):
            my_id = element.get('pid')
            my_type = element.get('type')
            ref = element.find('references')
            if ref is not None:
                element = ref
            for span_element in element.findall('span'):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                my_prop = KafSingleProperty(my_id,my_type,target_ids)
                yield my_prop

    ## 18-June-2013
    def getSingleProperties(self):
        """Yield a KafSingleProperty per span of the
        KAF/features/properties/property/references/span structure."""
        for property in self.tree.findall('features/properties/property'):
            my_id = property.get('pid')
            if my_id is None:
                my_id = property.get('fpid')
            my_type = property.get('lemma')
            for span_element in property.findall('references/span'):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                my_prop = KafSingleProperty(my_id,my_type,target_ids)
                yield my_prop

    # This function adds a new property of the type given with the list of ids given
    # my_type -> 'sleeping comfort' list_ids = ['id1','id2']
    # It creates the features/properties layers in case
    # Agglomerates all the properties for the same TYPE under the same property element
    # It calculates automatically the number for the identifier depending on the number
    # of properties existing
    def add_property(self,my_type,list_ids):
        """Add a span of *list_ids* to the property with lemma *my_type*,
        creating the features/properties/property chain as needed."""
        #Looking for feature layer or creating it
        feature_layer = self.tree.find('features')
        if feature_layer is None:
            feature_layer = etree.Element('features')
            self.tree.getroot().append(feature_layer)

        #Looking for properties layer
        properties_layer = feature_layer.find('properties')
        if properties_layer is None:
            properties_layer = etree.Element('properties')
            feature_layer.append(properties_layer)

        num_props = 0
        property_layer = None
        for property in properties_layer.findall('property'):
            num_props += 1
            prop_type = property.get('lemma')
            if prop_type == my_type:
                property_layer = property
                break

        if property_layer is None: # There is not any property for that type yet, create one
            property_layer = etree.Element('property')
            property_layer.set('pid','p'+str(num_props+1))
            property_layer.set('lemma',my_type)
            properties_layer.append(property_layer)

        references = property_layer.find('references')
        if references is None:
            references = etree.Element('references')
            property_layer.append(references)
        ## Create the new span
        span = etree.Element('span')
        references.append(span)
        for my_id in list_ids:
            span.append(etree.Element('target',attrib={'id':my_id}))

    def getSingleEntities(self):
        """Yield a KafSingleEntity per span of the entities layer; supports
        both entity/references/span and entity/span layouts."""
        for element in self.tree.findall('entities/entity'):
            my_id = element.get('eid')
            my_type = element.get('type')
            ref = element.find('references')
            if ref is not None:
                my_path_to_span = 'references/span'
            else:
                my_path_to_span = 'span'

            for span_element in element.findall(my_path_to_span):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                my_prop = KafSingleEntity(my_id,my_type,target_ids)
                yield my_prop

    def getOpinions(self):
        """Yield a KafOpinion (holder span, target span, expression) per
        <opinion> element of the opinions layer."""
        for element in self.tree.findall('opinions/opinion'):
            my_id = element.get('oid')

            tar_ids_hol = []
            tar_ids_tar = []
            ## BUG FIX: the original initialised a misspelled 'strenght'
            ## variable, so 'strength' was undefined (NameError) whenever an
            ## opinion had no <opinion_expression> element.
            polarity = strength = ''
            tar_ids_exp = []

            #Holder
            opi_hol_eles = element.findall('opinion_holder')
            if len(opi_hol_eles)!=0:
                opi_hol_ele = opi_hol_eles[0]
                tar_ids_hol = [t_ele.get('id') for t_ele in opi_hol_ele.findall('span/target')]

            #Target
            opi_tar_eles = element.findall('opinion_target')
            if len(opi_tar_eles) != 0:
                opi_tar_ele = opi_tar_eles[0]
                tar_ids_tar = [t_ele.get('id') for t_ele in opi_tar_ele.findall('span/target')]

            ## Opinion expression
            opi_exp_eles = element.findall('opinion_expression')
            if len(opi_exp_eles) != 0:
                opi_exp_ele = opi_exp_eles[0]
                polarity = opi_exp_ele.get('polarity','')
                strength = opi_exp_ele.get('strength','')
                tar_ids_exp = [t_ele.get('id') for t_ele in opi_exp_ele.findall('span/target')]

            yield KafOpinion(my_id,tar_ids_hol, tar_ids_tar, KafOpinionExpression(polarity, strength,tar_ids_exp))