opener-property-tagger 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +62 -0
- data/bin/property-tagger +7 -0
- data/bin/property-tagger-server +8 -0
- data/config.ru +5 -0
- data/core/extract_aspects.py +18 -0
- data/core/hotel_property_tagger_nl_en.py +133 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/ext/hack/Rakefile +13 -0
- data/ext/hack/support.rb +38 -0
- data/lib/opener/property_tagger.rb +86 -0
- data/lib/opener/property_tagger/cli.rb +84 -0
- data/lib/opener/property_tagger/public/markdown.css +284 -0
- data/lib/opener/property_tagger/server.rb +16 -0
- data/lib/opener/property_tagger/version.rb +5 -0
- data/lib/opener/property_tagger/views/index.erb +97 -0
- data/lib/opener/property_tagger/views/result.erb +15 -0
- data/opener-property-tagger.gemspec +37 -0
- data/pre_build_requirements.txt +1 -0
- metadata +183 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 449add6937b48b2b9c17a8174cc5790a2d366088
|
4
|
+
data.tar.gz: b64fc05e7b01f9720969607637e94e891a57c614
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 24d9e2caf35674b6e33e495ba84467198b6adbfcfe3dab50e586e6ec6467e61b24e73810164a426ef7d4c7fdbecbea90255d0bc0caeff09585015505ae13d16f
|
7
|
+
data.tar.gz: 7969b6cb5409ab4c6141e68ef708ac85fa9fa8169c476c696fe40bc5c6452dfc3cdd590cce40bf306370d1a7f7f1901f19206d53cef63015eaa79fedfea31080
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#property-tagger#
|
2
|
+
|
3
|
+
#Introduction#
|
4
|
+
|
5
|
+
This module implements a tagger for hotel properties for Dutch, English, French, Italian, Spanish and German. It detects aspect words,
|
6
|
+
for instance words related with "room", "cleanliness", "staff" or "breakfast" and links them with the correct aspect class.
|
7
|
+
The input for this module has to be a valid KAF file with at least the term layer, as the lemmas will be used for detecting the hotel properties. The output is also
|
8
|
+
a KAF valid file extended with the property layer. This module works for all the languages within the OpeNER project (en,de,nl,fr,es,it) and the language is read from
|
9
|
+
the input KAF file, from the lang attribute of the KAF element (make sure your preprocessors set this value properly, or you might use the resources for the wrong language)
|
10
|
+
````shell
|
11
|
+
<KAF xml:lang="( en | nl | fr | it | de | es)">
|
12
|
+
````
|
13
|
+
|
14
|
+
#Requirements#
|
15
|
+
* VUKafParserPy: parser in python for KAF files (https://github.com/opener-project/VU-kaf-parser)
|
16
|
+
* lxml: library for processing xml in python
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
#How to run the module with Python#
|
21
|
+
You can run this module from the command line using Python. The main script is core/hotel_property_tagger_nl_en.py. This reads
|
22
|
+
and writes the output to the standard output, generating some log information in the standard error output. To process the file
|
23
|
+
"input.kaf" just type:
|
24
|
+
````shell
|
25
|
+
cat input.kaf | core/hotel_property_tagger_nl_en.py > output.kaf
|
26
|
+
````
|
27
|
+
|
28
|
+
This will read the KAF file from "input.kaf" and write the same KAF, extended with the property layer, to "output.kaf".
|
29
|
+
|
30
|
+
The options of the program can be obtained calling to the program with the parameter -h:
|
31
|
+
````shell
|
32
|
+
core/hotel_property_tagger_nl_en.py -h
|
33
|
+
usage: hotel_property_tagger_nl_en.py [-h] [--no-time] [--lexicon LEXICON]
|
34
|
+
|
35
|
+
Tags a text with polarities at lemma level
|
36
|
+
|
37
|
+
optional arguments:
|
38
|
+
-h, --help show this help message and exit
|
39
|
+
--no-time For not including timestamp in header
|
40
|
+
--lexicon LEXICON Force to use this lexicon
|
41
|
+
````
|
42
|
+
|
43
|
+
The most important option is the --lexicon, which allows you to use your own lexicon with the program. The lexicon must be stored in a file and follow this format:
|
44
|
+
````shell
|
45
|
+
surf verb facilities
|
46
|
+
surfer noun facilities
|
47
|
+
surfing verb facilities
|
48
|
+
````
|
49
|
+
|
50
|
+
So, one aspect per line, with 3 fields separated by a tab: the first one is the word or span of words (in this case use whitespace), then the part
|
51
|
+
of speech (which actually is not used, so you can include a dummy label) and finally the aspect class associated with the word. If you have created your lexicon
|
52
|
+
in one file you could call to the program in this fashion:
|
53
|
+
````shell
|
54
|
+
$ cat my_input.kaf | python core/hotel_property_tagger_nl_en.py --lexicon path/to/your/lexicon/my_lexicon.txt > my_output.kaf
|
55
|
+
````
|
56
|
+
|
57
|
+
Contact
|
58
|
+
------
|
59
|
+
* Ruben Izquierdo
|
60
|
+
* Vrije University of Amsterdam
|
61
|
+
* ruben.izquierdobevia@vu.nl
|
62
|
+
|
data/bin/property-tagger
ADDED
data/config.ru
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
from lxml import etree
|
4
|
+
import sys
|
5
|
+
#filename='/Users/ruben/CODE/VU-sentiment-lexicon-xml/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml'
|
6
|
+
|
7
|
+
root = etree.parse(sys.stdin).getroot()
|
8
|
+
|
9
|
+
for element in root.findall('Lexicon/LexicalEntry'):
|
10
|
+
ele_lemma = element.findall('Lemma')[0]
|
11
|
+
ele_domain = element.findall('Sense/Domain')[0]
|
12
|
+
pos = element.get('partOfSpeech','unknown_pos')
|
13
|
+
if ele_lemma is not None and ele_domain is not None:
|
14
|
+
lemma = ele_lemma.get('writtenForm','').lower()
|
15
|
+
aspect = ele_domain.get('aspect','').lower()
|
16
|
+
if lemma!='' and aspect!='':
|
17
|
+
print lemma.encode('utf-8')+'\t'+pos.encode('utf-8')+'\t'+aspect.encode('utf-8')
|
18
|
+
|
@@ -0,0 +1,133 @@
|
|
1
|
+
#!/usr/bin/env python
"""VUA property tagger.

Reads a KAF document from standard input, looks up each term's lemma in a
per-language aspect lexicon (hotel domain: room, staff, breakfast, ...) and
writes the same KAF document extended with a property layer to standard
output. All diagnostics go to standard error so the KAF on stdout stays
clean.
"""

import sys
import argparse

import codecs
import os

this_folder = os.path.dirname(os.path.realpath(__file__))

# This updates the load path to ensure that the local site-packages directory
# can be used to load packages (e.g. a locally installed copy of lxml).
sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))

from VUKafParserPy import KafParser
from lxml import etree
from collections import defaultdict

__desc='VUA property tagger'
__last_edited='4nov2013'
__version='1.0'

###
__module_dir = os.path.dirname(__file__)
max_ngram = 1
verbose = False
##


########################################
## Format of the lexicon file (tab separated):
#lemma pos aspect
#lemma pos aspect
########################################
def loadAspects(my_lang,this_file=None):
    """Load the aspect lexicon for *my_lang*.

    If *this_file* is given it is used as the lexicon file; otherwise the
    bundled data/<lang>/aspects.txt is used. Returns a tuple
    (filename, {lemma: aspect}); the mapping is empty when the file does
    not exist (an error is reported on stderr in that case).
    """
    my_aspects = {}
    if this_file is not None:
        aspects_filename = this_file
    else:
        aspects_filename = os.path.join(__module_dir,'data',my_lang,'aspects.txt')

    if not os.path.exists(aspects_filename):
        print>>sys.stderr,'ERROR: file with aspects for the language',my_lang,'not found in',aspects_filename
    else:
        fic = codecs.open(aspects_filename,'r','utf-8')
        for line in fic:
            fields = line.strip().split('\t')
            # Skip blank or malformed lines instead of crashing on unpack.
            if len(fields) < 3:
                continue
            lemma,pos,aspect = fields[:3]
            my_aspects[lemma] = aspect
        fic.close()
    return aspects_filename, my_aspects
########################################



###### MAIN ########

argument_parser = argparse.ArgumentParser(description='Tags a text with polarities at lemma level')
argument_parser.add_argument("--no-time",action="store_false", default=True, dest="my_time_stamp",help="For not including timestamp in header")
argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon", help="Force to use this lexicon")

arguments = argument_parser.parse_args()

if not sys.stdin.isatty():
    ## READING FROM A PIPE
    pass
else:
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    print>>sys.stderr,sys.argv[0]+' -h for help'
    sys.exit(-1)


## Load the tree and the list of terms with the id
my_data = []
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    # Diagnostics go to stderr: writing them to stdout would corrupt the
    # KAF stream a downstream component is reading.
    print>>sys.stderr,'Error parsing input. Input is required to be KAF'
    print>>sys.stderr,str(e)
    sys.exit(2)


## Get language from the KAF file
my_lang = my_kaf_tree.getLanguage()

my_aspects_filename = my_aspects = None
if arguments.lexicon is None:
    if my_lang not in ['nl','en','de','fr','it','es']:
        print>>sys.stderr,'Error in the language specified in your KAF. The language is ',my_lang,' and possible values for this module '
        print>>sys.stderr,'are nl for Dutch ,en for English, es Spanish, fr French, it Italian or de German'
        sys.exit(1)

    my_aspects_filename, my_aspects = loadAspects(my_lang)
else:
    my_aspects_filename, my_aspects = loadAspects(my_lang,this_file=arguments.lexicon)

if verbose:
    print>>sys.stderr,'Loaded ',len(my_aspects),'aspects from',my_aspects_filename


for term in my_kaf_tree.getTerms():
    my_data.append((term.getLemma(),term.getId()))
if verbose: print>>sys.stderr,'Number of terms in the kaf file:',len(my_data)


current_token = found = 0
uniq_aspects = defaultdict(list)
while current_token < len(my_data):
    for tam_ngram in range(1,max_ngram+1):
        # Build an n-gram of size tam_ngram beginning at current_token
        if current_token + tam_ngram <= len(my_data):
            ngram = ' '.join(lemma for lemma,_ in my_data[current_token:current_token+tam_ngram])
            aspect = my_aspects.get(ngram,None)
            if aspect is not None:
                list_of_ids = [id for _,id in my_data[current_token:current_token+tam_ngram]]
                uniq_aspects[aspect].append((list_of_ids,ngram))
    current_token += 1


## Code for generating the property layer included in the Parser
for aspect, list_of_lists in uniq_aspects.items():
    for list_of_ids, str_text in list_of_lists:
        my_kaf_tree.add_property(aspect,list_of_ids,str_text)

my_kaf_tree.addLinguisticProcessor(__desc,__last_edited+'_'+__version,'features', arguments.my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
../VUKafParserPy/KafParserMod.py
|
2
|
+
../VUKafParserPy/__init__.py
|
3
|
+
../VUKafParserPy/KafDataObjectsMod.py
|
4
|
+
../VUKafParserPy/KafParserMod.pyc
|
5
|
+
../VUKafParserPy/__init__.pyc
|
6
|
+
../VUKafParserPy/KafDataObjectsMod.pyc
|
7
|
+
./
|
8
|
+
top_level.txt
|
9
|
+
SOURCES.txt
|
10
|
+
PKG-INFO
|
11
|
+
dependency_links.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
VUKafParserPy
|
@@ -0,0 +1,165 @@
|
|
1
|
+
class KafTermSentiment:
    """Sentiment information attached to a KAF term.

    All fields default to None; simpleInit() fills them in one call.
    """

    def __init__(self):
        self.resource=None
        self.polarity=None
        self.strength=None
        self.subjectivity=None
        # Initialised here so getSentimentModifier() is safe even when
        # simpleInit() was never called (previously it raised
        # AttributeError in that case).
        self.sentiment_modifier=None

    def simpleInit(self,r,p,st,su,sm=None):
        """Set resource, polarity, strength, subjectivity and modifier at once."""
        self.resource=r
        self.polarity=p
        self.strength=st
        self.subjectivity=su
        self.sentiment_modifier = sm

    def getPolarity(self):
        """Return the polarity value (may be None)."""
        return self.polarity

    def getSentimentModifier(self):
        """Return the sentiment modifier (may be None)."""
        return self.sentiment_modifier
class KafToken:
    """A single token (word form) from the KAF text layer.

    Stores the token id, its surface string, and the optional sentence
    and paragraph identifiers it belongs to.
    """

    def __init__(self, wid, value, sent=None, para=None):
        self.token_id = wid   # token identifier (e.g. 'w1')
        self.value = value    # surface form of the token
        self.sent = sent      # sentence id, if known
        self.para = para      # paragraph id, if known
class KafOpinionExpression:
    """An opinion expression: polarity, strength and the target term ids."""

    def __init__(self, polarity, strength, targets):
        self.polarity = polarity
        self.strength = strength
        self.targets = targets

    def __str__(self):
        """Return a compact debug string for this expression."""
        joined_ids = '-'.join(self.targets)
        return 'Op_exp==> pol:' + self.polarity + ' Str:' + self.strength + ' ids:' + joined_ids
class KafOpinion:
    """An opinion annotation: holder ids, target ids and an expression.

    *opi_exp* is typically a KafOpinionExpression; __str__ delegates to
    its string form.
    """

    def __init__(self, id, holders, targets, opi_exp):
        self.id = id
        self.holders = holders
        self.targets = targets
        self.opi_exp = opi_exp

    def __str__(self):
        """Return a multi-line debug string for this opinion."""
        c='Opinion id'+self.id+'\n'
        c+='  Holders: '+'-'.join(self.holders)+'\n'
        c+='  Targets: '+'-'.join(self.targets)+'\n'
        c+=str(self.opi_exp)
        return c
class KafSingleProperty:
    """A property annotation: an aspect type spanning a list of term ids."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Return the property identifier."""
        return self.id

    def get_type(self):
        """Return the aspect type of this property."""
        return self.type

    def get_span(self):
        """Return the list of term ids covered by this property."""
        return self.targets

    def __str__(self):
        return 'Id: %s Type: %s ids:%s' % (self.id, self.type, ' '.join(self.targets))
class KafSingleEntity:
    """A named-entity annotation: an entity type spanning a list of term ids."""

    def __init__(self, id, type, targets):
        self.id = id
        self.type = type
        self.targets = targets

    def get_id(self):
        """Return the entity identifier."""
        return self.id

    def get_type(self):
        """Return the entity type."""
        return self.type

    def get_span(self):
        """Return the list of term ids covered by this entity."""
        return self.targets

    def __str__(self):
        return 'Id: %s Type: %s ids:%s' % (self.id, self.type, ' '.join(self.targets))
class KafTerm:
    """A term from the KAF term layer.

    Holds the term id, lemma, pos, morphofeat, the span of token ids it
    covers, and an optional sentiment object (KafTermSentiment-like,
    providing getPolarity()/getSentimentModifier()).
    """

    def __init__(self):
        self.tid = None            # term identifier
        self.lemma = None          # lemma string
        self.pos = None            # part-of-speech tag
        self.morphofeat = None     # morphological features
        self.sentiment = None      # optional sentiment object
        self.list_span_id = []     # token ids spanned by this term

    def get_morphofeat(self):
        """Return the morphofeat value (may be None)."""
        return self.morphofeat

    def set_list_span_id(self, L):
        """Store the list of token ids spanned by this term."""
        self.list_span_id = L

    def get_list_span(self):
        """Return the list of token ids spanned by this term."""
        return self.list_span_id

    def get_polarity(self):
        """Return the sentiment polarity, or None when no sentiment is set."""
        if self.sentiment is None:
            return None
        return self.sentiment.getPolarity()

    def get_sentiment_modifier(self):
        """Return the sentiment modifier, or None when no sentiment is set."""
        if self.sentiment is None:
            return None
        return self.sentiment.getSentimentModifier()

    def setSentiment(self, my_sent):
        """Attach a sentiment object to this term."""
        self.sentiment = my_sent

    def getSentiment(self):
        return self.sentiment

    def getLemma(self):
        return self.lemma

    def setLemma(self, lemma):
        self.lemma = lemma

    def getPos(self):
        return self.pos

    def setPos(self, pos):
        self.pos = pos

    def getId(self):
        return self.tid

    def setId(self, id):
        self.tid = id

    def getShortPos(self):
        """Return a single lowercase letter derived from the pos tag.

        The first letter is lowercased, then remapped: 'g' -> 'a' and
        'a' -> 'r' (KAF tag-set convention used by this package).
        Returns None when no pos is set.
        """
        if self.pos is None:
            return None
        short = self.pos[0].lower()
        if short == 'g':
            short = 'a'
        elif short == 'a':
            short = 'r'
        return short

    def __str__(self):
        # Only meaningful once id, lemma and pos are all set.
        if self.tid and self.lemma and self.pos:
            return self.tid + '\n\t' + self.lemma.encode('utf-8') + '\n\t' + self.pos
        else:
            return 'None'