opener-property-tagger 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +62 -0
- data/bin/property-tagger +7 -0
- data/bin/property-tagger-server +8 -0
- data/config.ru +5 -0
- data/core/extract_aspects.py +18 -0
- data/core/hotel_property_tagger_nl_en.py +133 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/ext/hack/Rakefile +13 -0
- data/ext/hack/support.rb +38 -0
- data/lib/opener/property_tagger.rb +86 -0
- data/lib/opener/property_tagger/cli.rb +84 -0
- data/lib/opener/property_tagger/public/markdown.css +284 -0
- data/lib/opener/property_tagger/server.rb +16 -0
- data/lib/opener/property_tagger/version.rb +5 -0
- data/lib/opener/property_tagger/views/index.erb +97 -0
- data/lib/opener/property_tagger/views/result.erb +15 -0
- data/opener-property-tagger.gemspec +37 -0
- data/pre_build_requirements.txt +1 -0
- metadata +183 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 449add6937b48b2b9c17a8174cc5790a2d366088
|
4
|
+
data.tar.gz: b64fc05e7b01f9720969607637e94e891a57c614
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 24d9e2caf35674b6e33e495ba84467198b6adbfcfe3dab50e586e6ec6467e61b24e73810164a426ef7d4c7fdbecbea90255d0bc0caeff09585015505ae13d16f
|
7
|
+
data.tar.gz: 7969b6cb5409ab4c6141e68ef708ac85fa9fa8169c476c696fe40bc5c6452dfc3cdd590cce40bf306370d1a7f7f1901f19206d53cef63015eaa79fedfea31080
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#property-tagger#
|
2
|
+
|
3
|
+
#Introduction#
|
4
|
+
|
5
|
+
This module implements a tagger for hotel properties for Dutch, English, French, Italian, Spanish and German. It detects aspect words,
|
6
|
+
for instance words related with "room", "cleanliness", "staff" or "breakfast" and links them with the correct aspect class.
|
7
|
+
The input for this module has to be a valid KAF file with at least the term layer, as the lemmas will be used for detecting the hotel properties. The output is also
|
8
|
+
a KAF valid file extended with the property layer. This module works for all the languages within the OpeNER project (en,de,nl,fr,es,it) and the language is read from
|
9
|
+
the input KAF file, from the lang attribute of the KAF element (make sure your preprocessors set this value properly, or you might use the resources for the wrong language)
|
10
|
+
````shell
|
11
|
+
<KAF xml:lang="( en | nl | fr | it | de | es)">
|
12
|
+
````
|
13
|
+
|
14
|
+
#Requirements#
|
15
|
+
* VUKafParserPy: parser in python for KAF files (https://github.com/opener-project/VU-kaf-parser)
|
16
|
+
* lxml: library for processing xml in python
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
#How to run the module with Python#
|
21
|
+
You can run this module from the command line using Python. The main script is core/hotel_property_tagger_nl_en.py. This reads
|
22
|
+
and writes the output to the standard output, generating some log information in the standard error output. To process the file
|
23
|
+
"input.kaf" just type:
|
24
|
+
````shell
|
25
|
+
cat input.kaf | core/hotel_property_tagger_nl_en.py > output.kaf
|
26
|
+
````
|
27
|
+
|
28
|
+
This will read the KAF file in "input.kaf" and store in "output.kaf" the same KAF extended with the property layer.
|
29
|
+
|
30
|
+
The options of the program can be obtained calling to the program with the parameter -h:
|
31
|
+
````shell
|
32
|
+
core/hotel_property_tagger_nl_en.py -h
|
33
|
+
usage: hotel_property_tagger_nl_en.py [-h] [--no-time] [--lexicon LEXICON]
|
34
|
+
|
35
|
+
Tags a text with polarities at lemma level
|
36
|
+
|
37
|
+
optional arguments:
|
38
|
+
-h, --help show this help message and exit
|
39
|
+
--no-time For not including timestamp in header
|
40
|
+
--lexicon LEXICON Force to use this lexicon
|
41
|
+
````
|
42
|
+
|
43
|
+
The most important option is the --lexicon, which allows you to use your own lexicon with the program. The lexicon must be stored in a file and follow this format:
|
44
|
+
````shell
|
45
|
+
surf verb facilities
|
46
|
+
surfer noun facilities
|
47
|
+
surfing verb facilities
|
48
|
+
````
|
49
|
+
|
50
|
+
So, one aspect per line, with 3 fields separated by a tabulator, the first one is the word or span of words (in this case use whitespaces), then the part
|
51
|
+
of speech (which is actually not used; you can include a dummy label) and finally the aspect class associated with the word. If you have created your lexicon
|
52
|
+
in one file you could call to the program in this fashion:
|
53
|
+
````shell
|
54
|
+
$ cat my_input.kaf | python core/hotel_property_tagger_nl_en.py --lexicon path/to/your/lexicon/my_lexicon.txt > my_output.kaf
|
55
|
+
````
|
56
|
+
|
57
|
+
Contact
|
58
|
+
------
|
59
|
+
* Ruben Izquierdo
|
60
|
+
* Vrije University of Amsterdam
|
61
|
+
* ruben.izquierdobevia@vu.nl
|
62
|
+
|
data/bin/property-tagger
ADDED
data/config.ru
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env python
# Python 2 script: extracts a lemma<TAB>pos<TAB>aspect lexicon from a
# VU sentiment-lexicon XML file read on standard input and writes one
# entry per line to standard output (the format loadAspects() consumes).

from lxml import etree
import sys
#filename='/Users/ruben/CODE/VU-sentiment-lexicon-xml/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml'

# Parse the whole lexicon document from stdin.
root = etree.parse(sys.stdin).getroot()

for element in root.findall('Lexicon/LexicalEntry'):
    # NOTE(review): findall(...)[0] raises IndexError when the child is
    # absent, so the "is not None" guard below can never see a missing
    # element — confirm entries always carry Lemma and Sense/Domain.
    ele_lemma = element.findall('Lemma')[0]
    ele_domain = element.findall('Sense/Domain')[0]
    pos = element.get('partOfSpeech','unknown_pos')
    if ele_lemma is not None and ele_domain is not None:
        # Lemmas and aspect classes are lower-cased to make lexicon lookups
        # case-insensitive downstream.
        lemma = ele_lemma.get('writtenForm','').lower()
        aspect = ele_domain.get('aspect','').lower()
        if lemma!='' and aspect!='':
            print lemma.encode('utf-8')+'\t'+pos.encode('utf-8')+'\t'+aspect.encode('utf-8')
|
18
|
+
|
@@ -0,0 +1,133 @@
|
|
1
|
+
#!/usr/bin/env python
# Python 2 script: tags hotel-domain aspect words in a KAF document.
# Reads KAF from stdin, writes KAF extended with a property layer to stdout.

import sys
import argparse

import codecs
import os

# Folder containing this script; used to locate bundled resources.
this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))

from VUKafParserPy import KafParser
from lxml import etree
from collections import defaultdict

# Module metadata, recorded in the KAF linguistic-processors header.
__desc='VUA property tagger'
__last_edited='4nov2013'
__version='1.0'

###
__module_dir = os.path.dirname(__file__)  # base dir for the per-language lexicons
max_ngram = 1                             # longest lemma n-gram looked up in the lexicon
verbose = False                           # set True for progress information on stderr
##


########################################
## Format of the file:
#lemma pos aspect
#lemma pos aspect
########################################
|
35
|
+
def loadAspects(my_lang,this_file=None):
    """Load the aspect lexicon for language *my_lang*.

    Parameters:
        my_lang:   language code used to locate the bundled lexicon in
                   data/<my_lang>/aspects.txt when *this_file* is None.
        this_file: optional path to a user-supplied lexicon overriding the
                   bundled one (the --lexicon command-line option).

    Returns:
        (aspects_filename, my_aspects) where *my_aspects* maps a lemma (or a
        whitespace-joined span of lemmas) to its aspect class.  The mapping
        is empty when the file does not exist; an error is printed to stderr
        in that case instead of raising.
    """
    my_aspects = {}
    if this_file is not None:
        aspects_filename = this_file
    else:
        aspects_filename = os.path.join(__module_dir,'data',my_lang,'aspects.txt')

    if not os.path.exists(aspects_filename):
        # sys.stderr.write keeps this message working on Python 2 and 3
        # (the original used the Python-2-only "print>>" statement).
        sys.stderr.write('ERROR: file with aspects for the language %s not found in %s\n'
                         % (my_lang, aspects_filename))
    else:
        fic = codecs.open(aspects_filename,'r','utf-8')
        for line in fic:
            fields = line.strip().split('\t')
            # Bug fix: skip blank or malformed lines instead of crashing on
            # tuple unpacking (trailing newlines are common in hand-edited
            # lexicon files).
            if len(fields) != 3:
                continue
            lemma, pos, aspect = fields
            my_aspects[lemma] = aspect
        fic.close()
    return aspects_filename, my_aspects
|
52
|
+
########################################
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
###### MAIN ########

# Command-line interface; the KAF document itself travels stdin -> stdout.
argument_parser = argparse.ArgumentParser(description='Tags a text with polarities at lemma level')
argument_parser.add_argument("--no-time",action="store_false", default=True, dest="my_time_stamp",help="For not including timestamp in header")
argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon", help="Force to use this lexicon")

arguments = argument_parser.parse_args()

if not sys.stdin.isatty():
    ## READING FROM A PIPE
    pass
else:
    # No piped input: print usage hints and abort.
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    print>>sys.stderr,sys.argv[0]+' -h for help'
    sys.exit(-1)


## Load the tree and the list of terms with the id
my_data = []    # (lemma, term_id) pairs in document order
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    # NOTE(review): parse errors go to stdout, not stderr — verify this is
    # deliberate (downstream consumers would see them in the output stream).
    print>>sys.stdout,'Error parsing input. Input is required to be KAF'
    print>>sys.stdout,str(e)
    sys.exit(2)


## Get language from the KAF file
my_lang = my_kaf_tree.getLanguage()

my_aspects_filename = my_aspects = None
if arguments.lexicon is None:
    # Only these languages ship with a bundled lexicon.
    if my_lang not in ['nl','en','de','fr','it','es']:
        print>>sys.stdout,'Error in the language specified in your KAF. The language is ',my_lang,' and possible values for this module '
        print>>sys.stdout,'are nl for Dutch ,en for English, es Spanish, fr French, it Italian or de German'
        sys.exit(1)

    my_aspects_filename, my_aspects = loadAspects(my_lang)
else:
    # A user-supplied lexicon bypasses the language whitelist above.
    my_aspects_filename, my_aspects = loadAspects(my_lang,this_file=arguments.lexicon)

if verbose:
    print>>sys.stderr,'Loaded ',len(my_aspects),'aspects from',my_aspects_filename


for term in my_kaf_tree.getTerms():
    my_data.append((term.getLemma(),term.getId()))
if verbose: print>>sys.stderr,'Number of terms in the kaf file:',len(my_data)


# Slide a window over the term sequence and look up every n-gram of
# 1..max_ngram lemmas in the aspect lexicon.
current_token = found = 0   # NOTE(review): 'found' is assigned but never used
uniq_aspects = defaultdict(list)  # aspect -> [([term ids], matched text), ...]
while current_token < len(my_data):
    for tam_ngram in range(1,max_ngram+1):
        # Build an n-gram of size tam_ngram and beginning in current_token
        if current_token + tam_ngram <= len(my_data):
            ngram = ' '.join(lemma for lemma,_ in my_data[current_token:current_token+tam_ngram])
            aspect = my_aspects.get(ngram,None)
            if aspect is not None:
                list_of_ids = [id for _,id in my_data[current_token:current_token+tam_ngram]]
                uniq_aspects[aspect].append((list_of_ids,ngram))
    current_token += 1


## Code for generating the property layer included in the Parser
for aspect, list_of_lists in uniq_aspects.items():
    for list_of_ids, str_text in list_of_lists:
        my_kaf_tree.add_property(aspect,list_of_ids,str_text)

# Record this processor in the KAF header and emit the extended document.
my_kaf_tree.addLinguisticProcessor(__desc,__last_edited+'_'+__version,'features', arguments.my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
../VUKafParserPy/KafParserMod.py
|
2
|
+
../VUKafParserPy/__init__.py
|
3
|
+
../VUKafParserPy/KafDataObjectsMod.py
|
4
|
+
../VUKafParserPy/KafParserMod.pyc
|
5
|
+
../VUKafParserPy/__init__.pyc
|
6
|
+
../VUKafParserPy/KafDataObjectsMod.pyc
|
7
|
+
./
|
8
|
+
top_level.txt
|
9
|
+
SOURCES.txt
|
10
|
+
PKG-INFO
|
11
|
+
dependency_links.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
VUKafParserPy
|
@@ -0,0 +1,165 @@
|
|
1
|
+
class KafTermSentiment:
    """Sentiment information attached to a KAF term.

    All fields default to None; simpleInit() populates them in one call.
    """

    def __init__(self):
        self.resource = None
        self.polarity = None
        self.strength = None
        self.subjectivity = None
        # Bug fix: initialize here so getSentimentModifier() does not raise
        # AttributeError on an instance where simpleInit() was never called.
        self.sentiment_modifier = None

    def simpleInit(self, r, p, st, su, sm=None):
        """Populate resource, polarity, strength, subjectivity and
        (optionally) the sentiment modifier in a single call."""
        self.resource = r
        self.polarity = p
        self.strength = st
        self.subjectivity = su
        self.sentiment_modifier = sm

    def getPolarity(self):
        """Return the polarity label, or None if unset."""
        return self.polarity

    def getSentimentModifier(self):
        """Return the sentiment-modifier label, or None if unset."""
        return self.sentiment_modifier
|
20
|
+
|
21
|
+
|
22
|
+
class KafToken:
    """A single token from the KAF text layer.

    Holds the token id, its surface form, and the (optional) sentence and
    paragraph numbers it belongs to.
    """

    def __init__(self, wid, value, sent=None, para=None):
        # Surface form and identifiers are stored as given; sentence and
        # paragraph stay None when the input KAF does not provide them.
        self.value = value
        self.token_id = wid
        self.para = para
        self.sent = sent
|
28
|
+
|
29
|
+
|
30
|
+
class KafOpinionExpression:
    """An opinion expression: polarity and strength over a span of term ids."""

    def __init__(self, polarity, strength, targets):
        self.polarity = polarity
        self.strength = strength
        self.targets = targets

    def __str__(self):
        # Render the target span as dash-separated ids.
        span = '-'.join(self.targets)
        return 'Op_exp==> pol:' + self.polarity + ' Str:' + self.strength + ' ids:' + span
|
38
|
+
|
39
|
+
class KafOpinion:
    """A complete KAF opinion: holder span, target span and expression."""

    def __init__(self, id, holders, targets, opi_exp):
        self.id = id
        self.holders = holders
        self.targets = targets
        self.opi_exp = opi_exp

    def __str__(self):
        # One line each for the header, holders, targets and the expression.
        header = 'Opinion id' + self.id
        holder_line = ' Holders: ' + '-'.join(self.holders)
        target_line = ' Targets: ' + '-'.join(self.targets)
        return '\n'.join([header, holder_line, target_line, str(self.opi_exp)])
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
class KafSingleProperty:
    """A property (aspect) annotation: id, aspect class and a term-id span."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Return the property identifier."""
        return self.id

    def get_type(self):
        """Return the aspect class (e.g. 'facilities')."""
        return self.type

    def get_span(self):
        """Return the list of term ids the property covers."""
        return self.targets

    def __str__(self):
        return 'Id: {0} Type: {1} ids:{2}'.format(self.id, self.type, ' '.join(self.targets))
|
73
|
+
|
74
|
+
|
75
|
+
class KafSingleEntity:
    """A named-entity annotation: id, entity type and a term-id span."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Return the entity identifier."""
        return self.id

    def get_type(self):
        """Return the entity type label."""
        return self.type

    def get_span(self):
        """Return the list of term ids the entity covers."""
        return self.targets

    def __str__(self):
        return 'Id: %s Type: %s ids:%s' % (self.id, self.type, ' '.join(self.targets))
|
92
|
+
|
93
|
+
class KafTerm:
    """A term from the KAF term layer.

    Carries the term id, lemma, part of speech, morphological features, an
    optional KafTermSentiment object, and the list of word-form ids the term
    spans.  All fields start as None (span as []) and are filled by setters.
    """

    def __init__(self):
        self.tid = None
        self.lemma = None
        self.pos = None
        self.morphofeat = None
        self.sentiment = None
        self.list_span_id = []

    def get_morphofeat(self):
        """Return the morphofeat string, or None if unset."""
        return self.morphofeat

    def set_list_span_id(self, L):
        """Set the list of word-form ids this term spans."""
        self.list_span_id = L

    def get_list_span(self):
        """Return the list of word-form ids this term spans."""
        return self.list_span_id

    def get_polarity(self):
        """Return the sentiment polarity, or None when no sentiment is attached."""
        # Idiom fix: identity comparison with None instead of '!='.
        if self.sentiment is not None:
            return self.sentiment.getPolarity()
        else:
            return None

    def get_sentiment_modifier(self):
        """Return the sentiment modifier, or None when no sentiment is attached."""
        if self.sentiment is not None:
            return self.sentiment.getSentimentModifier()
        else:
            return None

    def setSentiment(self, my_sent):
        """Attach a KafTermSentiment object to this term."""
        self.sentiment = my_sent

    def getSentiment(self):
        """Return the attached sentiment object, or None."""
        return self.sentiment

    def getLemma(self):
        """Return the lemma, or None if unset."""
        return self.lemma

    def setLemma(self, lemma):
        """Set the lemma."""
        self.lemma = lemma

    def getPos(self):
        """Return the part-of-speech tag, or None if unset."""
        return self.pos

    def setPos(self, pos):
        """Set the part-of-speech tag."""
        self.pos = pos

    def getId(self):
        """Return the term id, or None if unset."""
        return self.tid

    def setId(self, id):
        """Set the term id."""
        self.tid = id

    def getShortPos(self):
        """Return a one-letter lowercase POS code, or None when no POS is set.

        The first letter of the POS is used, with two remappings:
        'g' -> 'a' and 'a' -> 'r'.
        """
        if self.pos is None:
            return None
        auxpos = self.pos.lower()[0]
        if auxpos == 'g':
            auxpos = 'a'
        elif auxpos == 'a':
            auxpos = 'r'
        return auxpos

    def __str__(self):
        if self.tid and self.lemma and self.pos:
            # NOTE(review): .encode('utf-8') yields bytes on Python 3 and
            # would break this concatenation there; kept as-is because the
            # surrounding package targets Python 2 — confirm before porting.
            return self.tid+'\n\t'+self.lemma.encode('utf-8')+'\n\t'+self.pos
        else:
            return 'None'
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
|