opener-tree-tagger 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +150 -0
  3. data/bin/opener-tree-tagger-daemon +7 -0
  4. data/bin/opener-tree-tagger-server +11 -0
  5. data/bin/tree-tagger +7 -0
  6. data/config.ru +5 -0
  7. data/core/dutch.map.treetagger.kaf.csv +40 -0
  8. data/core/english.map.treetagger.kaf.csv +36 -0
  9. data/core/french.map.treetagger.kaf.csv +33 -0
  10. data/core/german.map.treetagger.kaf.csv +52 -0
  11. data/core/italian.map.treetagger.kaf.csv +38 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  14. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  15. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  16. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  20. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  21. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  23. data/core/spanish.map.treetagger.kaf.csv +75 -0
  24. data/core/token_matcher.py +82 -0
  25. data/core/tt_from_kaf_to_kaf.py +215 -0
  26. data/exec/tree-tagger.rb +9 -0
  27. data/ext/hack/Rakefile +13 -0
  28. data/ext/hack/support.rb +38 -0
  29. data/lib/opener/tree_tagger.rb +69 -0
  30. data/lib/opener/tree_tagger/cli.rb +69 -0
  31. data/lib/opener/tree_tagger/public/markdown.css +284 -0
  32. data/lib/opener/tree_tagger/server.rb +16 -0
  33. data/lib/opener/tree_tagger/version.rb +5 -0
  34. data/lib/opener/tree_tagger/views/index.erb +96 -0
  35. data/lib/opener/tree_tagger/views/result.erb +15 -0
  36. data/opener-tree-tagger.gemspec +35 -0
  37. data/pre_build_requirements.txt +1 -0
  38. metadata +197 -0
@@ -0,0 +1,439 @@
1
+ ########################################################################
2
+ # 14 Jan 2013: added function add_attrs_to_layer
3
+ ########################################################################
4
+
5
+ ###################
6
+ # List of changes #
7
+ ###################
8
+ # 14 Jan 2013: added function add_attrs_to_layer
9
+ # 27 Feb 2013: added code to comply with the DTD
10
+ # 18 Jun 2013: getSingleProperties adapted to the structure KAF/features/properties/property/references/span/target
11
+ # 18 Jun 2013: function add_property created for adding properties to the KAF
12
+
13
+
14
+ from lxml import etree
15
+ from KafDataObjectsMod import *
16
+ import time
17
+
18
class KafParser:
    """Read, query and extend a KAF XML document held in an lxml tree.

    The document is kept in ``self.tree`` (an ``etree`` ElementTree).  Reader
    methods walk fixed paths below the root (``text/wf``, ``terms/term``,
    ``entities/entity``, ``opinions/opinion`` ...) and writer methods create
    the corresponding layers on demand.
    """

    def __init__(self, filename=None):
        """Load ``filename`` if given, otherwise start an empty KAF document.

        An empty document gets a ``KAF`` root with ``version="v1.opener"``
        and ``xml:lang="en"``.
        """
        self.tree = None
        # wid -> path of the <wf> element, filled by __textTokenization
        self.__pathForToken = {}
        # token id -> [term ids], built lazily by get_term_ids_for_token_id
        self.__term_ids_for_token_id = None

        if filename:
            parser = etree.XMLParser(remove_blank_text=True, strip_cdata=False)
            self.tree = etree.parse(filename, parser)
            self.__textTokenization()
        else:
            root = etree.Element('KAF')
            root.set('version', 'v1.opener')
            root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            self.tree = etree.ElementTree(element=root)

    def __textTokenization(self):
        """Record, for every ``text/wf`` element, the tree path keyed by its ``wid``."""
        for wf in self.tree.findall('text/wf'):
            self.__pathForToken[wf.get('wid')] = self.tree.getpath(wf)

    def getToken(self, tid):
        """Return the ``<wf>`` element whose ``wid`` is ``tid``, or ``None`` if unknown.

        Only tokens indexed when the document was loaded from a file are known.
        """
        path = self.__pathForToken.get(tid)
        if path is None:
            return None
        return self.tree.xpath(path)[0]

    def getLanguage(self):
        """Return the root ``xml:lang`` attribute, defaulting to ``'nl'`` when absent."""
        return self.tree.getroot().get('{http://www.w3.org/XML/1998/namespace}lang', 'nl')

    def get_tokens_in_sentences(self):
        """Group the tokens of the text layer by consecutive sentence id.

        Returns a list ``[(sent_id, [(token_id, token_text), ...]), ...]`` in
        document order.  A new group starts each time the ``sent`` attribute
        changes.  An empty text layer yields an empty list.
        """
        sents = []
        current = []
        previous_sent = None
        for element in self.tree.findall('text/wf'):
            s_id = element.get('sent')
            if previous_sent is not None and s_id != previous_sent:
                sents.append((previous_sent, current))
                current = []
            current.append((element.get('wid'), element.text))
            previous_sent = s_id
        # Flush the last group; skipped when there were no tokens at all
        # (the previous code failed here on an unbound sentence id).
        if current:
            sents.append((previous_sent, current))
        return sents

    def get_term_ids_for_token_id(self, tok_id):
        """Return the list of term ids whose span targets token ``tok_id``.

        The token -> terms index is built from ``terms/term`` on first use and
        then reused; it is not refreshed if terms are added afterwards.
        Unknown tokens give an empty list.
        """
        if self.__term_ids_for_token_id is None:
            index = {}
            for term in self.tree.findall('terms/term'):
                term_id = term.get('tid')
                for target in term.findall('span/target'):
                    index.setdefault(target.get('id'), []).append(term_id)
            self.__term_ids_for_token_id = index
        return self.__term_ids_for_token_id.get(tok_id, [])

    def getTokens(self):
        """Yield ``(text, sentence_id, token_id)`` for every ``text/wf`` element.

        A missing ``sent`` attribute is reported as ``'0'``.
        """
        for element in self.tree.findall('text/wf'):
            yield (element.text, element.get('sent', '0'), element.get('wid'))

    def getTerms(self):
        """Yield a ``KafTerm`` for every ``terms/term`` element.

        Each term carries its id, lemma, pos and morphofeat, plus a
        ``KafTermSentiment`` when a ``<sentiment>`` child exists and the list
        of span target ids when a ``<span>`` child exists.
        """
        if self.tree is None:
            return
        for element in self.tree.findall('terms/term'):
            kafTermObj = KafTerm()
            kafTermObj.setId(element.get('tid'))
            kafTermObj.setLemma(element.get('lemma'))
            kafTermObj.setPos(element.get('pos'))
            kafTermObj.morphofeat = element.get('morphofeat')

            sentiment = element.find('sentiment')
            if sentiment is not None:
                my_sent = KafTermSentiment()
                my_sent.simpleInit(sentiment.get('resource', ''),
                                   sentiment.get('polarity', None),
                                   sentiment.get('strength', ''),
                                   sentiment.get('subjectivity', ''),
                                   sentiment.get('sentiment_modifier'))
                kafTermObj.setSentiment(my_sent)

            span = element.find('span')
            if span is not None:
                kafTermObj.set_list_span_id([target.get('id') for target in span.findall('target')])

            yield kafTermObj

    def getSentimentTriples(self):
        """Return ``[(lemma, polarity, sentiment_modifier), ...]`` for all terms.

        Polarity and modifier are ``None`` for terms without a ``<sentiment>``
        child (or without the respective attribute).
        """
        data = []
        if self.tree is None:
            return data
        for term_element in self.tree.findall('terms/term'):
            polarity = None
            sentiment_modifier = None
            sentiment_element = term_element.find('sentiment')
            if sentiment_element is not None:
                polarity = sentiment_element.get('polarity', None)
                sentiment_modifier = sentiment_element.get('sentiment_modifier')
            data.append((term_element.get('lemma'), polarity, sentiment_modifier))
        return data

    def addPolarityToTerm(self, termid, my_sentiment_attribs, polarity_pos=None):
        """Append a ``<sentiment>`` element to every term whose ``tid`` is ``termid``.

        ``my_sentiment_attribs`` is the attribute dict of the new element.
        When the term has no ``pos`` and ``polarity_pos`` is given, it is
        stored as the term's ``pos``.  Does nothing if there is no terms layer.
        """
        if self.tree is None:
            return
        # findall() copes with a missing <terms> layer, where iterating the
        # result of find('terms') would fail on None.
        for element in self.tree.findall('terms/term'):
            if element.get('tid', '') != termid:
                continue
            if not element.get('pos') and polarity_pos is not None:
                element.set('pos', polarity_pos)
            element.append(etree.Element('sentiment', attrib=my_sentiment_attribs))

    def saveToFile(self, filename, myencoding='UTF-8'):
        """Write the document, pretty-printed with an XML declaration, to ``filename``."""
        if self.tree is not None:
            self.tree.write(filename, encoding=myencoding, pretty_print=True, xml_declaration=True)

    def addLinguisticProcessor(self, name, version, layer, time_stamp=True):
        """Register an ``<lp>`` entry for ``layer`` in the ``kafHeader``.

        The ``kafHeader`` (inserted as first child of the root) and the
        ``linguisticProcessors`` element for ``layer`` are created when
        missing; a new ``linguisticProcessors`` element is placed right after
        the last existing one.  With ``time_stamp`` false the timestamp is
        written as ``'*'``.
        """
        kaf_header = self.tree.find('kafHeader')
        if kaf_header is None:
            kaf_header = etree.Element('kafHeader')
            self.tree.getroot().insert(0, kaf_header)

        if time_stamp:
            my_time = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        else:
            my_time = '*'

        my_lp = etree.Element('lp')
        my_lp.set('timestamp', my_time)
        my_lp.set('version', version)
        my_lp.set('name', name)

        existing = kaf_header.findall('linguisticProcessors')
        for candidate in existing:
            if candidate.get('layer', '') == layer:
                candidate.append(my_lp)
                return

        # No element for this layer yet: create it and keep all
        # linguisticProcessors elements contiguous in the header.
        my_lp_ele = etree.Element('linguisticProcessors')
        my_lp_ele.set('layer', layer)
        my_lp_ele.append(my_lp)
        if existing:
            position = list(kaf_header).index(existing[-1]) + 1
            kaf_header.insert(position, my_lp_ele)
        else:
            kaf_header.append(my_lp_ele)

    def addLayer(self, type, element, first_char_id=None):
        """Append ``element`` to the layer named ``type`` and return its new id.

        The layer is created under the root when missing.  The id is
        ``first_char_id`` (default: first character of ``type``) followed by
        the number of children already in the layer plus one, and is stored
        on the element as attribute ``<first_char_id>id``.
        """
        if first_char_id is None:
            first_char_id = type[0]

        layer_element = self.tree.find(type)
        if layer_element is None:
            layer_element = etree.Element(type)
            self.tree.getroot().append(layer_element)

        # len() counts the current children (0 for a freshly created layer)
        new_id = first_char_id + str(len(layer_element) + 1)
        element.set(first_char_id + 'id', new_id)
        layer_element.append(element)
        return new_id

    def addElementToLayer(self, layer, element, first_char_id=None):
        """Alias of :meth:`addLayer`."""
        return self.addLayer(layer, element, first_char_id)

    def add_attrs_to_layer(self, layer, attrs):
        """Set every ``name: value`` pair of ``attrs`` on the layer element, if it exists."""
        layer_element = self.tree.find(layer)
        if layer_element is not None:
            for att, val in attrs.items():
                layer_element.set(att, val)

    def addAttributeToElement(self, path, str_id, id, attribute, value, sub_path=None):
        """Set ``attribute=value`` on the first element under ``path`` with ``str_id == id``.

        When ``sub_path`` is given and matches below that element, the first
        match receives the attribute instead.  Nothing happens when ``id`` is
        ``None`` or no element matches.
        """
        if id is None:
            return
        for element in self.tree.findall(path):
            if element.get(str_id, None) != id:
                continue
            if sub_path is not None:
                elements = element.findall(sub_path)
                if elements:
                    element = elements[0]
            element.set(attribute, value)
            return

    ## This works with the original definition of the property layer
    ## KAF -> properties -> property* -> span* -> target*
    def getSingleProperties_old(self):
        """Yield a ``KafSingleProperty`` per span found under ``properties/property``.

        Spans are read from the ``references`` child when present, otherwise
        directly from the property element.
        """
        for element in self.tree.findall('properties/property'):
            my_id = element.get('pid')
            my_type = element.get('type')
            ref = element.find('references')
            span_parent = element if ref is None else ref
            for span_element in span_parent.findall('span'):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                yield KafSingleProperty(my_id, my_type, target_ids)

    ## 18-June-2013
    def getSingleProperties(self):
        """Yield a ``KafSingleProperty`` per span under ``features/properties/property``.

        The id is taken from ``pid`` (falling back to ``fpid``) and the type
        from the ``lemma`` attribute.
        """
        for prop in self.tree.findall('features/properties/property'):
            my_id = prop.get('pid')
            if my_id is None:
                my_id = prop.get('fpid')
            my_type = prop.get('lemma')
            for span_element in prop.findall('references/span'):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                yield KafSingleProperty(my_id, my_type, target_ids)

    def _append_span(self, parent, target_ids):
        """Append a ``<span>`` with one ``<target id=...>`` per id to ``parent``."""
        span = etree.SubElement(parent, 'span')
        for target_id in target_ids:
            etree.SubElement(span, 'target', attrib={'id': target_id})
        return span

    # This function adds a new property of the type given with the list of ids given
    # my_type -> 'sleeping comfort' list_ids = ['id1','id2']
    def add_property(self, my_type, list_ids, comment=None):
        """Add a span over ``list_ids`` to the property whose lemma is ``my_type``.

        The ``features`` and ``properties`` layers and the ``property``
        element are created when missing; all spans for the same type are
        grouped under one ``property``.  A new property gets id ``p<N+1>``
        where N is the number of existing properties.  ``comment``, when
        given, is written as an XML comment just before the new span.
        """
        feature_layer = self.tree.find('features')
        if feature_layer is None:
            feature_layer = etree.Element('features')
            self.tree.getroot().append(feature_layer)

        properties_layer = feature_layer.find('properties')
        if properties_layer is None:
            properties_layer = etree.Element('properties')
            feature_layer.append(properties_layer)

        existing = properties_layer.findall('property')
        property_layer = None
        for candidate in existing:
            if candidate.get('lemma') == my_type:
                property_layer = candidate
                break

        if property_layer is None:
            # No property for that type yet, create one
            property_layer = etree.Element('property')
            property_layer.set('pid', 'p' + str(len(existing) + 1))
            property_layer.set('lemma', my_type)
            properties_layer.append(property_layer)

        references = property_layer.find('references')
        if references is None:
            references = etree.Element('references')
            property_layer.append(references)

        if comment is not None:
            references.append(etree.Comment(comment))
        self._append_span(references, list_ids)

    def getSingleEntities(self):
        """Yield a ``KafSingleEntity`` per span found under ``entities/entity``.

        Both layouts are read: ``entity/references/span`` when a
        ``references`` child exists, ``entity/span`` otherwise.
        """
        for element in self.tree.findall('entities/entity'):
            my_id = element.get('eid')
            my_type = element.get('type')
            if element.find('references') is not None:
                my_path_to_span = 'references/span'
            else:
                my_path_to_span = 'span'

            for span_element in element.findall(my_path_to_span):
                target_ids = [target_element.get('id') for target_element in span_element.findall('target')]
                yield KafSingleEntity(my_id, my_type, target_ids)

    def getOpinions(self):
        """Yield a ``KafOpinion`` for every ``opinions/opinion`` element.

        Holder, target and expression spans default to empty lists, and
        polarity/strength to ``''``, when the corresponding child is missing.
        Only the first holder/target/expression child is read.
        """
        for element in self.tree.findall('opinions/opinion'):
            my_id = element.get('oid')

            tar_ids_hol = []
            tar_ids_tar = []
            tar_ids_exp = []
            # Both defaults must be bound: an opinion may lack an expression.
            polarity = strength = ''

            opi_hol_ele = element.find('opinion_holder')
            if opi_hol_ele is not None:
                tar_ids_hol = [t_ele.get('id') for t_ele in opi_hol_ele.findall('span/target')]

            opi_tar_ele = element.find('opinion_target')
            if opi_tar_ele is not None:
                tar_ids_tar = [t_ele.get('id') for t_ele in opi_tar_ele.findall('span/target')]

            opi_exp_ele = element.find('opinion_expression')
            if opi_exp_ele is not None:
                polarity = opi_exp_ele.get('polarity', '')
                strength = opi_exp_ele.get('strength', '')
                tar_ids_exp = [t_ele.get('id') for t_ele in opi_exp_ele.findall('span/target')]

            yield KafOpinion(my_id, tar_ids_hol, tar_ids_tar,
                             KafOpinionExpression(polarity, strength, tar_ids_exp))

    def remove_opinion_layer(self):
        """Remove the ``opinions`` layer from the root, if present."""
        opinion_layer = self.tree.find('opinions')
        if opinion_layer is not None:
            self.tree.getroot().remove(opinion_layer)

    ## This function adds an opinion to the opinion layer, creating it if it does not exist
    def add_opinion(self, hol_ids, tar_ids, polarity, strength, exp_ids):
        """Append an ``<opinion>`` with holder, target and expression spans.

        The ``opinions`` layer is created when missing.  The opinion id is the
        first ``o<N>`` (N = 1, 2, ...) not already used in the layer.
        ``strength`` is converted with ``str()``.
        """
        opinion_layer = self.tree.find('opinions')
        if opinion_layer is None:
            opinion_layer = etree.Element('opinions')
            self.tree.getroot().append(opinion_layer)

        # Generate an id that is not in use yet
        used_oids = set(opi.get('oid') for opi in opinion_layer)
        n = 1
        while 'o' + str(n) in used_oids:
            n += 1
        opinion_id = 'o' + str(n)

        op_ele = etree.SubElement(opinion_layer, 'opinion')
        op_ele.set('oid', opinion_id)

        self._append_span(etree.SubElement(op_ele, 'opinion_holder'), hol_ids)
        self._append_span(etree.SubElement(op_ele, 'opinion_target'), tar_ids)

        op_exp = etree.SubElement(op_ele, 'opinion_expression',
                                  attrib={'polarity': polarity, 'strength': str(strength)})
        self._append_span(op_exp, exp_ids)
435
+
436
+
437
+
438
+
439
+
@@ -0,0 +1,7 @@
1
+ ## version = 0.2
2
+ ## Added timestamp to function addLinguisticProcessor
3
+ ## 24-april-2013 --> getSingleEntities and getSingleProperties read entities/props in both formats:
4
+ ## entities -> entity -> span -> target and entities -> entity -> references -> span
5
+ ####
6
+
7
+ from KafParserMod import KafParser