opener-property-tagger 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +62 -0
- data/bin/property-tagger +7 -0
- data/bin/property-tagger-server +8 -0
- data/config.ru +5 -0
- data/core/extract_aspects.py +18 -0
- data/core/hotel_property_tagger_nl_en.py +133 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/ext/hack/Rakefile +13 -0
- data/ext/hack/support.rb +38 -0
- data/lib/opener/property_tagger.rb +86 -0
- data/lib/opener/property_tagger/cli.rb +84 -0
- data/lib/opener/property_tagger/public/markdown.css +284 -0
- data/lib/opener/property_tagger/server.rb +16 -0
- data/lib/opener/property_tagger/version.rb +5 -0
- data/lib/opener/property_tagger/views/index.erb +97 -0
- data/lib/opener/property_tagger/views/result.erb +15 -0
- data/opener-property-tagger.gemspec +37 -0
- data/pre_build_requirements.txt +1 -0
- metadata +183 -0
@@ -0,0 +1,439 @@
|
|
1
|
+
########################################################################
|
2
|
+
# 14 Jan 2013: added function add_attrs_to_layer
|
3
|
+
########################################################################
|
4
|
+
|
5
|
+
###################
|
6
|
+
# List of changes #
|
7
|
+
###################
|
8
|
+
# 14 Jan 2013: added function add_attrs_to_layer
|
9
|
+
# 27 Feb 2013: added code to comply with the DTD
|
10
|
+
# 18 Jun 2013: getSingleProperties adapted to the structure KAF/features/properties/property/references/span/target
|
11
|
+
# 18 Jun 2013: function add_property created for adding the properties to the KAF
|
12
|
+
|
13
|
+
|
14
|
+
from lxml import etree
|
15
|
+
from KafDataObjectsMod import *
|
16
|
+
import time
|
17
|
+
|
18
|
+
class KafParser:
|
19
|
+
def __init__(self, filename=None):
    """Load a KAF document from *filename*, or start a new empty one.

    When *filename* is given the file is parsed (blank text removed, CDATA
    sections kept) and the token index is built. Otherwise a bare ``KAF``
    root with ``version="v1.opener"`` and ``xml:lang="en"`` is created.
    """
    self.tree = None
    self.__pathForToken = {}
    self.__term_ids_for_token_id = None

    if not filename:
        # No input file: start from an empty KAF skeleton.
        kaf_root = etree.Element('KAF')
        kaf_root.set('version', 'v1.opener')
        kaf_root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        self.tree = etree.ElementTree(element=kaf_root)
        return

    xml_parser = etree.XMLParser(remove_blank_text=True, strip_cdata=False)
    self.tree = etree.parse(filename, xml_parser)
    # Build the token id -> path index used by getToken.
    self.__textTokenization()
|
34
|
+
|
35
|
+
def __textTokenization(self):
    """Index every ``text/wf`` element: token id -> path of the element."""
    tree = self.tree
    self.__pathForToken.update(
        (wf.get('wid'), tree.getpath(wf)) for wf in tree.findall('text/wf')
    )
|
39
|
+
|
40
|
+
|
41
|
+
def getToken(self,tid):
|
42
|
+
if tid in self.__pathForToken:
|
43
|
+
path = self.__pathForToken[tid]
|
44
|
+
return self.tree.xpath(self.__pathForToken[tid])[0]
|
45
|
+
return None
|
46
|
+
|
47
|
+
|
48
|
+
def getLanguage(self):
|
49
|
+
lang = self.tree.getroot().get('{http://www.w3.org/XML/1998/namespace}lang','nl')
|
50
|
+
return lang
|
51
|
+
|
52
|
+
## Return a list of (sentence_id, TOKENS) where tokens is a list of (token_id,token)
|
53
|
+
## [(s_id1, T1), (sent_id2, T2)....]
|
54
|
+
## T1 --> [(tokenid, token), (tokenid2,token2)....]
|
55
|
+
def get_tokens_in_sentences(self):
|
56
|
+
sents = []
|
57
|
+
current = []
|
58
|
+
previous_sent = None
|
59
|
+
for element in self.tree.findall('text/wf'):
|
60
|
+
w_id = element.get('wid')
|
61
|
+
s_id = element.get('sent')
|
62
|
+
word = element.text
|
63
|
+
|
64
|
+
if previous_sent is not None and s_id != previous_sent:
|
65
|
+
sents.append((previous_sent,current))
|
66
|
+
current = []
|
67
|
+
current.append((w_id,word))
|
68
|
+
previous_sent = s_id
|
69
|
+
####
|
70
|
+
sents.append((s_id,current))
|
71
|
+
return sents
|
72
|
+
|
73
|
+
def get_term_ids_for_token_id(self,tok_id):
|
74
|
+
if self.__term_ids_for_token_id is None:
|
75
|
+
self.__term_ids_for_token_id = {}
|
76
|
+
for element in self.tree.findall('terms/term'):
|
77
|
+
term_id = element.get('tid')
|
78
|
+
for target in element.findall('span/target'):
|
79
|
+
token_id = target.get('id')
|
80
|
+
if token_id not in self.__term_ids_for_token_id:
|
81
|
+
self.__term_ids_for_token_id[token_id] = [term_id]
|
82
|
+
else:
|
83
|
+
self.__term_ids_for_token_id[token_id].append(term_id)
|
84
|
+
return self.__term_ids_for_token_id.get(tok_id,[])
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
def getTokens(self):
|
89
|
+
for element in self.tree.findall('text/wf'):
|
90
|
+
w_id = element.get('wid')
|
91
|
+
s_id = element.get('sent','0')
|
92
|
+
word = element.text
|
93
|
+
yield (word, s_id, w_id)
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
def getTerms(self):
|
98
|
+
if self.tree:
|
99
|
+
for element in self.tree.findall('terms/term'):
|
100
|
+
kafTermObj = KafTerm()
|
101
|
+
kafTermObj.setId(element.get('tid'))
|
102
|
+
kafTermObj.setLemma(element.get('lemma'))
|
103
|
+
kafTermObj.setPos(element.get('pos'))
|
104
|
+
kafTermObj.morphofeat = element.get('morphofeat')
|
105
|
+
|
106
|
+
## Parsing sentiment
|
107
|
+
sentiment = element.find('sentiment')
|
108
|
+
if sentiment is not None:
|
109
|
+
resource = sentiment.get('resource','')
|
110
|
+
polarity = sentiment.get('polarity',None)
|
111
|
+
strength = sentiment.get('strength','')
|
112
|
+
subjectivity = sentiment.get('subjectivity','')
|
113
|
+
sentiment_modifier = sentiment.get('sentiment_modifier')
|
114
|
+
|
115
|
+
my_sent = KafTermSentiment()
|
116
|
+
my_sent.simpleInit(resource,polarity,strength,subjectivity,sentiment_modifier)
|
117
|
+
kafTermObj.setSentiment(my_sent)
|
118
|
+
|
119
|
+
## Parsing the span
|
120
|
+
span = element.find('span')
|
121
|
+
if span is not None:
|
122
|
+
list_ids = [target.get('id') for target in span.findall('target')]
|
123
|
+
kafTermObj.set_list_span_id(list_ids)
|
124
|
+
|
125
|
+
|
126
|
+
yield kafTermObj
|
127
|
+
else:
|
128
|
+
return
|
129
|
+
|
130
|
+
|
131
|
+
def getSentimentTriples(self):
|
132
|
+
data = []
|
133
|
+
if self.tree:
|
134
|
+
for term_element in self.tree.findall('terms/term'):
|
135
|
+
lemma = term_element.get('lemma')
|
136
|
+
polarity = None
|
137
|
+
sentiment_modifier = None
|
138
|
+
|
139
|
+
sentiment_element = term_element.find('sentiment')
|
140
|
+
if sentiment_element is not None:
|
141
|
+
polarity = sentiment_element.get('polarity',None)
|
142
|
+
sentiment_modifier = sentiment_element.get('sentiment_modifier')
|
143
|
+
data.append( (lemma,polarity,sentiment_modifier))
|
144
|
+
return data
|
145
|
+
|
146
|
+
|
147
|
+
|
148
|
+
def addPolarityToTerm(self,termid,my_sentiment_attribs,polarity_pos=None):
|
149
|
+
if self.tree:
|
150
|
+
for element in self.tree.find('terms'):
|
151
|
+
if element.get('tid','')==termid:
|
152
|
+
|
153
|
+
#In case there is no pos info, we use the polarityPos
|
154
|
+
if not element.get('pos') and polarity_pos is not None:
|
155
|
+
element.set('pos',polarity_pos)
|
156
|
+
sentEle = etree.Element('sentiment',attrib=my_sentiment_attribs)
|
157
|
+
element.append(sentEle)
|
158
|
+
|
159
|
+
def saveToFile(self,filename,myencoding='UTF-8'):
|
160
|
+
if self.tree:
|
161
|
+
self.tree.write(filename,encoding=myencoding,pretty_print=True,xml_declaration=True)
|
162
|
+
|
163
|
+
|
164
|
+
def addLinguisticProcessor(self, name, version, layer, time_stamp=True):
    """Record a linguistic processor entry in the KAF header.

    An ``lp`` element carrying a timestamp, *version* and *name* is added
    to the ``linguisticProcessors`` group whose ``layer`` attribute equals
    *layer*. The ``kafHeader`` (inserted as first child of the root) and
    the group are created when missing; a new group is placed right after
    the last existing group. With *time_stamp* false the timestamp
    attribute is the literal ``'*'``.
    """
    headers = self.tree.findall('kafHeader')
    if headers:
        kaf_header = headers[0]
    else:
        kaf_header = etree.Element('kafHeader')
        self.tree.getroot().insert(0, kaf_header)

    # Make sure at least one group exists before searching for the layer.
    if not kaf_header.findall('linguisticProcessors'):
        first_group = etree.Element('linguisticProcessors')
        first_group.set('layer', layer)
        kaf_header.append(first_group)

    # Look for the group that already describes this layer.
    groups = kaf_header.findall('linguisticProcessors')
    layer_group = None
    for group in groups:
        if group.get('layer', '') == layer:
            layer_group = group
            break

    stamp = time.strftime('%Y-%m-%dT%H:%M:%S%Z') if time_stamp else '*'

    # Attributes are set one by one to keep their order in the output.
    lp_entry = etree.Element('lp')
    lp_entry.set('timestamp', stamp)
    lp_entry.set('version', version)
    lp_entry.set('name', name)

    if layer_group is not None:
        # A group for this layer already exists: just add the entry.
        layer_group.append(lp_entry)
        return

    # No group for this layer yet: create it right after the last group.
    layer_group = etree.Element('linguisticProcessors')
    layer_group.set('layer', layer)
    layer_group.append(lp_entry)
    position = kaf_header.index(groups[-1])
    kaf_header.insert(position + 1, layer_group)
|
207
|
+
|
208
|
+
|
209
|
+
def addLayer(self,type,element,first_char_id=None):
|
210
|
+
if first_char_id is None:
|
211
|
+
first_char_id = type[0]
|
212
|
+
|
213
|
+
## Check if there is already layer for the type
|
214
|
+
layer_element = self.tree.find(type)
|
215
|
+
|
216
|
+
if layer_element is None:
|
217
|
+
layer_element = etree.Element(type)
|
218
|
+
self.tree.getroot().append(layer_element)
|
219
|
+
## The id is going to be the first one
|
220
|
+
new_id = first_char_id+'1'
|
221
|
+
else:
|
222
|
+
## We need to know how many elements there are in the layer
|
223
|
+
current_n = len(layer_element.getchildren())
|
224
|
+
new_id = first_char_id+''+str(current_n+1)
|
225
|
+
|
226
|
+
|
227
|
+
## In this point layer_element points to the correct element, existing or created
|
228
|
+
|
229
|
+
element.set(first_char_id+'id',new_id)
|
230
|
+
layer_element.append(element)
|
231
|
+
return new_id
|
232
|
+
|
233
|
+
def addElementToLayer(self, layer, element, first_char_id=None):
    """Alias of ``addLayer``: add *element* to *layer* and return its new id."""
    new_id = self.addLayer(layer, element, first_char_id)
    return new_id
|
235
|
+
|
236
|
+
def add_attrs_to_layer(self,layer,attrs):
|
237
|
+
layer_element = self.tree.find(layer)
|
238
|
+
if layer_element is not None:
|
239
|
+
for att, val in attrs.items():
|
240
|
+
layer_element.set(att,val)
|
241
|
+
|
242
|
+
|
243
|
+
def addAttributeToElement(self,path,str_id, id, attribute, value,sub_path=None):
|
244
|
+
for element in self.tree.findall(path):
|
245
|
+
if id is not None and element.get(str_id,None) == id:
|
246
|
+
if sub_path is not None:
|
247
|
+
elements = element.findall(sub_path)
|
248
|
+
if len(elements)!=0: element = elements[0]
|
249
|
+
element.set(attribute,value)
|
250
|
+
return
|
251
|
+
|
252
|
+
|
253
|
+
## This works with the original definition of the property layer
|
254
|
+
## KAF -> properties -> property* -> span* -> target*
|
255
|
+
def getSingleProperties_old(self):
|
256
|
+
for element in self.tree.findall('properties/property'):
|
257
|
+
my_id = element.get('pid')
|
258
|
+
my_type = element.get('type')
|
259
|
+
ref = element.find('references')
|
260
|
+
if ref is not None:
|
261
|
+
element = ref
|
262
|
+
for span_element in element.findall('span'):
|
263
|
+
target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
|
264
|
+
my_prop = KafSingleProperty(my_id,my_type,target_ids)
|
265
|
+
yield my_prop
|
266
|
+
|
267
|
+
## 18-June-2013
|
268
|
+
def getSingleProperties(self):
|
269
|
+
for property in self.tree.findall('features/properties/property'):
|
270
|
+
my_id = property.get('pid')
|
271
|
+
if my_id is None:
|
272
|
+
my_id = property.get('fpid')
|
273
|
+
my_type = property.get('lemma')
|
274
|
+
for span_element in property.findall('references/span'):
|
275
|
+
target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
|
276
|
+
my_prop = KafSingleProperty(my_id,my_type,target_ids)
|
277
|
+
yield my_prop
|
278
|
+
|
279
|
+
# This function adds a new property of the type given with the list of ids given
|
280
|
+
# my_type -> 'sleeping comfort' list_ids = ['id1','id2']
|
281
|
+
# It creates the features/properties layers in case they do not exist
|
282
|
+
# Agglomerates all the properties for the same TYPE under the same property element
|
283
|
+
# It calculates automatically the number for the identifier depending on the number
|
284
|
+
# of properties existing
|
285
|
+
def add_property(self, my_type, list_ids, comment=None):
    """Add a span of target ids to the property whose lemma is *my_type*.

    The ``features`` and ``properties`` layers, the ``property`` element
    for *my_type* and its ``references`` child are created when missing.
    All spans for the same type are grouped under one ``property``
    element. A new property gets the id ``p<n>``, starting at the number
    of existing properties plus one and skipping ids already in use.
    When *comment* is given it is added as an XML comment before the span.
    """
    # Looking for the feature layer or creating it
    feature_layer = self.tree.find('features')
    if feature_layer is None:
        feature_layer = etree.Element('features')
        self.tree.getroot().append(feature_layer)

    # Looking for the properties layer or creating it
    properties_layer = feature_layer.find('properties')
    if properties_layer is None:
        properties_layer = etree.Element('properties')
        feature_layer.append(properties_layer)

    existing = properties_layer.findall('property')
    property_layer = None
    for candidate in existing:
        if candidate.get('lemma') == my_type:
            property_layer = candidate
            break

    if property_layer is None:
        # There is no property for that type yet, let's create one.
        # Counting alone can reuse an id when the existing pids are not
        # contiguous, so advance until the id is free.
        used_pids = set(prop.get('pid') for prop in existing)
        number = len(existing) + 1
        while 'p' + str(number) in used_pids:
            number += 1

        property_layer = etree.Element('property')
        property_layer.set('pid', 'p' + str(number))
        property_layer.set('lemma', my_type)
        properties_layer.append(property_layer)

    references = property_layer.find('references')
    if references is None:
        references = etree.Element('references')
        property_layer.append(references)

    # Create the new span
    if comment is not None:
        references.append(etree.Comment(comment))
    span = etree.Element('span')
    references.append(span)
    for my_id in list_ids:
        span.append(etree.Element('target', attrib={'id': my_id}))
|
326
|
+
|
327
|
+
|
328
|
+
|
329
|
+
|
330
|
+
def getSingleEntities(self):
|
331
|
+
for element in self.tree.findall('entities/entity'):
|
332
|
+
my_id = element.get('eid')
|
333
|
+
my_type = element.get('type')
|
334
|
+
my_path_to_span = None
|
335
|
+
ref = element.find('references')
|
336
|
+
if ref is not None:
|
337
|
+
my_path_to_span = 'references/span'
|
338
|
+
else:
|
339
|
+
my_path_to_span = 'span'
|
340
|
+
|
341
|
+
for span_element in element.findall(my_path_to_span):
|
342
|
+
target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
|
343
|
+
my_prop = KafSingleEntity(my_id,my_type,target_ids)
|
344
|
+
yield my_prop
|
345
|
+
|
346
|
+
|
347
|
+
def getOpinions(self):
    """Yield a ``KafOpinion`` for every ``opinions/opinion`` element.

    For each opinion the target ids of the first ``opinion_holder``,
    ``opinion_target`` and ``opinion_expression`` children are collected
    (empty lists when a child is missing). Polarity and strength come from
    the expression element and default to ``''``.
    """
    for element in self.tree.findall('opinions/opinion'):
        my_id = element.get('oid')

        tar_ids_hol = []
        tar_ids_tar = []
        tar_ids_exp = []
        # Reset both values for every opinion. The original misspelt the
        # second name ('strenght'), leaving 'strength' unbound or stale
        # for opinions without an opinion_expression.
        polarity = strength = ''

        # Holder
        opi_hol_ele = element.find('opinion_holder')
        if opi_hol_ele is not None:
            tar_ids_hol = [t_ele.get('id') for t_ele in opi_hol_ele.findall('span/target')]

        # Target
        opi_tar_ele = element.find('opinion_target')
        if opi_tar_ele is not None:
            tar_ids_tar = [t_ele.get('id') for t_ele in opi_tar_ele.findall('span/target')]

        # Opinion expression
        opi_exp_ele = element.find('opinion_expression')
        if opi_exp_ele is not None:
            polarity = opi_exp_ele.get('polarity', '')
            strength = opi_exp_ele.get('strength', '')
            tar_ids_exp = [t_ele.get('id') for t_ele in opi_exp_ele.findall('span/target')]

        expression = KafOpinionExpression(polarity, strength, tar_ids_exp)
        yield KafOpinion(my_id, tar_ids_hol, tar_ids_tar, expression)
|
377
|
+
|
378
|
+
|
379
|
+
|
380
|
+
def remove_opinion_layer(self):
|
381
|
+
opinion_layer = self.tree.find('opinions')
|
382
|
+
if opinion_layer is not None:
|
383
|
+
self.tree.getroot().remove(opinion_layer)
|
384
|
+
|
385
|
+
## This function adds an opinion to the opinion layer, creating the layer if it does not exist
|
386
|
+
## The id is calculated automatically according to the number of elements and ensuring there is no repetition
|
387
|
+
def add_opinion(self, hol_ids, tar_ids, polarity, strength, exp_ids):
    """Append a new ``opinion`` element to the ``opinions`` layer.

    The layer is created under the root when missing. The opinion id is
    the first ``o<n>`` (n = 1, 2, ...) not used by an existing child of
    the layer. The opinion gets an ``opinion_holder``, ``opinion_target``
    and ``opinion_expression`` child, each holding a ``span`` with one
    ``target`` per id in *hol_ids*, *tar_ids* and *exp_ids* respectively.
    *polarity* and ``str(strength)`` are stored on the expression element.
    """
    def attach_span(parent, token_ids):
        # Add a span under parent with one target element per token id.
        span = etree.Element('span')
        parent.append(span)
        for token_id in token_ids:
            span.append(etree.Element('target', attrib={'id': token_id}))

    # Looking for the opinion layer or creating it
    opinion_layer = self.tree.find('opinions')
    if opinion_layer is None:
        opinion_layer = etree.Element('opinions')
        self.tree.getroot().append(opinion_layer)

    # Generating a unique id
    taken_oids = [opi.get('oid') for opi in opinion_layer]
    counter = 1
    while 'o' + str(counter) in taken_oids:
        counter += 1

    opinion = etree.Element('opinion')
    opinion_layer.append(opinion)
    opinion.set('oid', 'o' + str(counter))

    # Holder
    holder = etree.Element('opinion_holder')
    opinion.append(holder)
    attach_span(holder, hol_ids)

    # Target
    target = etree.Element('opinion_target')
    opinion.append(target)
    attach_span(target, tar_ids)

    # Expression
    expression = etree.Element('opinion_expression', attrib={'polarity':polarity,
                                                             'strength':str(strength)})
    opinion.append(expression)
    attach_span(expression, exp_ids)
|
435
|
+
|
436
|
+
|
437
|
+
|
438
|
+
|
439
|
+
|
@@ -0,0 +1,7 @@
|
|
1
|
+
## version = 0.2
|
2
|
+
## Added timestamp to function addLinguisticProcessor
|
3
|
+
## 24-april-2013 --> getSingleEntities and getSingleProperties read both entities/props in format
|
4
|
+
## entities -> entity -> span -> target and entities -> entity -> references -> span
|
5
|
+
####
|
6
|
+
|
7
|
+
from KafParserMod import KafParser
|
data/ext/hack/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require_relative 'support'
|
3
|
+
|
4
|
+
# Runs verify_requirements, which is not defined in this file
# (presumably provided by the required 'support' file -- confirm).
desc 'Verifies the requirements'
task :requirements do
  verify_requirements
end

# Default task: depends on :requirements. Its own body is entirely
# commented out, so it currently does nothing beyond that prerequisite.
task :default => :requirements do
  # path = File.join(PYTHON_SITE_PACKAGES, 'pre_install')
  #
  # pip_install(PRE_INSTALL_REQUIREMENTS, path)
end
|