opener-property-tagger 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 449add6937b48b2b9c17a8174cc5790a2d366088
4
+ data.tar.gz: b64fc05e7b01f9720969607637e94e891a57c614
5
+ SHA512:
6
+ metadata.gz: 24d9e2caf35674b6e33e495ba84467198b6adbfcfe3dab50e586e6ec6467e61b24e73810164a426ef7d4c7fdbecbea90255d0bc0caeff09585015505ae13d16f
7
+ data.tar.gz: 7969b6cb5409ab4c6141e68ef708ac85fa9fa8169c476c696fe40bc5c6452dfc3cdd590cce40bf306370d1a7f7f1901f19206d53cef63015eaa79fedfea31080
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ #property-tagger#
2
+
3
+ #Introduction#
4
+
5
+ This module implements a tagger for hotel properties for Dutch, English, French, Italian, Spanish and German. It detects aspect words,
6
+ for instance words related with "room", "cleanliness", "staff" or "breakfast" and links them with the correct aspect class.
7
+ The input for this module has to be a valid KAF file with at least the term layer, as the lemmas will be used for detecting the hotel properties. The output is also
8
+ a KAF valid file extended with the property layer. This module works for all the languages within the OpeNER project (en,de,nl,fr,es,it) and the language is read from
9
+ the input KAF file, from the lang attribute of the KAF element (make sure your preprocessors set this value properly, or you might use the resources for a wrong language)
10
+ ````shell
11
+ <KAF xml:lang="( en | nl | fr | it | de | es)">
12
+ ````
13
+
14
+ #Requirements#
15
+ * VUKafParserPy: parser in python for KAF files (https://github.com/opener-project/VU-kaf-parser)
16
+ * lxml: library for processing xml in python
17
+
18
+
19
+
20
+ #How to run the module with Python#
21
+ You can run this module from the command line using Python. The main script is core/hotel_property_tagger_nl_en.py. This reads
22
+ and writes the output to the standard output, generating some log information in the standard error output. To process the file
23
+ "input.kaf" just type:
24
+ ````shell
25
+ cat input.kaf | core/hotel_property_tagger_nl_en.py > output.kaf
26
+ ````
27
+
28
+ This will read the KAF file from "input.kaf" and write the KAF file extended with the property layer to "output.kaf".
29
+
30
+ The options of the program can be obtained calling to the program with the parameter -h:
31
+ ````shell
32
+ core/hotel_property_tagger_nl_en.py -h
33
+ usage: hotel_property_tagger_nl_en.py [-h] [--no-time] [--lexicon LEXICON]
34
+
35
+ Tags a text with polarities at lemma level
36
+
37
+ optional arguments:
38
+ -h, --help show this help message and exit
39
+ --no-time For not including timestamp in header
40
+ --lexicon LEXICON Force to use this lexicon
41
+ ````
42
+
43
+ The most important option is the --lexicon, which allows you to use your own lexicon with the program. The lexicon must be stored in a file and follow this format:
44
+ ````shell
45
+ surf verb facilities
46
+ surfer noun facilities
47
+ surfing verb facilities
48
+ ````
49
+
50
+ So, one aspect per line, with 3 fields separated by a tabulator, the first one is the word or span of words (in this case use whitespaces), then the part
51
+ of speech (which actually is not used; you can include a dummy label) and finally the aspect class associated with the word. If you have created your lexicon
52
+ in one file you could call to the program in this fashion:
53
+ ````shell
54
+ $ cat my_input.kaf | python core/hotel_property_tagger_nl_en.py --lexicon path/to/your/lexicon/my_lexicon.txt > my_output.kaf
55
+ ````
56
+
57
+ Contact
58
+ ------
59
+ * Ruben Izquierdo
60
+ * Vrije Universiteit Amsterdam
61
+ * ruben.izquierdobevia@vu.nl
62
+
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/property_tagger'
4
+
5
+ cli = Opener::PropertyTagger::CLI.new(:args => ARGV)
6
+
7
+ cli.run(STDIN.tty? ? nil : STDIN.read)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'puma/cli'
4
+
5
+ rack_config = File.expand_path('../../config.ru', __FILE__)
6
+
7
+ cli = Puma::CLI.new([rack_config] + ARGV)
8
+ cli.run
data/config.ru ADDED
@@ -0,0 +1,5 @@
1
+ require File.expand_path('../lib/opener/property_tagger/hotel', __FILE__)
2
+ require File.expand_path('../lib/opener/property_tagger/server', __FILE__)
3
+
4
+ run Opener::PropertyTagger::Server
5
+
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+
3
+ from lxml import etree
4
+ import sys
5
+ #filename='/Users/ruben/CODE/VU-sentiment-lexicon-xml/VUSentimentLexicon/EN-lexicon/Sentiment-English-HotelDomain.xml'
6
+
7
+ root = etree.parse(sys.stdin).getroot()
8
+
9
+ for element in root.findall('Lexicon/LexicalEntry'):
10
+ ele_lemma = element.findall('Lemma')[0]
11
+ ele_domain = element.findall('Sense/Domain')[0]
12
+ pos = element.get('partOfSpeech','unknown_pos')
13
+ if ele_lemma is not None and ele_domain is not None:
14
+ lemma = ele_lemma.get('writtenForm','').lower()
15
+ aspect = ele_domain.get('aspect','').lower()
16
+ if lemma!='' and aspect!='':
17
+ print lemma.encode('utf-8')+'\t'+pos.encode('utf-8')+'\t'+aspect.encode('utf-8')
18
+
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import argparse
5
+
6
+ import codecs
7
+ import os
8
+
9
+ this_folder = os.path.dirname(os.path.realpath(__file__))
10
+
11
+ # This updates the load path to ensure that the local site-packages directory
12
+ # can be used to load packages (e.g. a locally installed copy of lxml).
13
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
14
+
15
+ from VUKafParserPy import KafParser
16
+ from lxml import etree
17
+ from collections import defaultdict
18
+
19
+ __desc='VUA property tagger'
20
+ __last_edited='4nov2013'
21
+ __version='1.0'
22
+
23
+ ###
24
+ __module_dir = os.path.dirname(__file__)
25
+ max_ngram = 1
26
+ verbose = False
27
+ ##
28
+
29
+
30
+ ########################################
31
+ ## Format of the file:
32
+ #lemma pos aspect
33
+ #lemma pos aspect
34
+ ########################################
35
+ def loadAspects(my_lang,this_file=None):
36
+ my_aspects = {}
37
+ if this_file is not None:
38
+ aspects_filename = this_file
39
+ else:
40
+ aspects_filename = os.path.join(__module_dir,'data',my_lang,'aspects.txt')
41
+
42
+ if not os.path.exists(aspects_filename):
43
+ print>>sys.stderr,'ERROR: file with aspects for the language',my_lang,'not found in',aspects_filename
44
+ else:
45
+ fic = codecs.open(aspects_filename,'r','utf-8')
46
+ for line in fic:
47
+ fields = line.strip().split('\t')
48
+ lemma,pos,aspect = fields
49
+ my_aspects[lemma] = aspect
50
+ fic.close()
51
+ return aspects_filename, my_aspects
52
+ ########################################
53
+
54
+
55
+
56
+ ###### MAIN ########
57
+
58
+ argument_parser = argparse.ArgumentParser(description='Tags a text with polarities at lemma level')
59
+ argument_parser.add_argument("--no-time",action="store_false", default=True, dest="my_time_stamp",help="For not including timestamp in header")
60
+ argument_parser.add_argument("--lexicon", action="store", default=None, dest="lexicon", help="Force to use this lexicon")
61
+
62
+ arguments = argument_parser.parse_args()
63
+
64
+ if not sys.stdin.isatty():
65
+ ## READING FROM A PIPE
66
+ pass
67
+ else:
68
+ print>>sys.stderr,'Input stream required.'
69
+ print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
70
+ print>>sys.stderr,sys.argv[0]+' -h for help'
71
+ sys.exit(-1)
72
+
73
+
74
+ ## Load the tree and the list of terms with the id
75
+ my_data = []
76
+ try:
77
+ my_kaf_tree = KafParser(sys.stdin)
78
+ except Exception as e:
79
+ print>>sys.stdout,'Error parsing input. Input is required to be KAF'
80
+ print>>sys.stdout,str(e)
81
+ sys.exit(2)
82
+
83
+
84
+ ## Get language from the KAF file
85
+ my_lang = my_kaf_tree.getLanguage()
86
+
87
+ my_aspects_filename = my_aspects = None
88
+ if arguments.lexicon is None:
89
+ if my_lang not in ['nl','en','de','fr','it','es']:
90
+ print>>sys.stdout,'Error in the language specified in your KAF. The language is ',my_lang,' and possible values for this module '
91
+ print>>sys.stdout,'are nl for Dutch ,en for English, es Spanish, fr French, it Italian or de German'
92
+ sys.exit(1)
93
+
94
+ my_aspects_filename, my_aspects = loadAspects(my_lang)
95
+ else:
96
+ my_aspects_filename, my_aspects = loadAspects(my_lang,this_file=arguments.lexicon)
97
+
98
+ if verbose:
99
+ print>>sys.stderr,'Loaded ',len(my_aspects),'aspects from',my_aspects_filename
100
+
101
+
102
+ for term in my_kaf_tree.getTerms():
103
+ my_data.append((term.getLemma(),term.getId()))
104
+ if verbose: print>>sys.stderr,'Number of terms in the kaf file:',len(my_data)
105
+
106
+
107
+ current_token = found = 0
108
+ uniq_aspects = defaultdict(list)
109
+ while current_token < len(my_data):
110
+ for tam_ngram in range(1,max_ngram+1):
111
+ # Build an n-gram of size tam_ngram and beginning in current_token
112
+ if current_token + tam_ngram <= len(my_data):
113
+ ngram = ' '.join(lemma for lemma,_ in my_data[current_token:current_token+tam_ngram])
114
+ aspect = my_aspects.get(ngram,None)
115
+ if aspect is not None:
116
+ list_of_ids = [id for _,id in my_data[current_token:current_token+tam_ngram]]
117
+ uniq_aspects[aspect].append((list_of_ids,ngram))
118
+ current_token += 1
119
+
120
+
121
+ ## Code for generating the propery layer included in the Parser
122
+ for aspect, list_of_lists in uniq_aspects.items():
123
+ for list_of_ids, str_text in list_of_lists:
124
+ my_kaf_tree.add_property(aspect,list_of_ids,str_text)
125
+
126
+ my_kaf_tree.addLinguisticProcessor(__desc,__last_edited+'_'+__version,'features', arguments.my_time_stamp)
127
+ my_kaf_tree.saveToFile(sys.stdout)
128
+
129
+
130
+
131
+
132
+
133
+
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 1.0
2
+ Name: VUKafParserPy
3
+ Version: 1.0
4
+ Summary: Library in python to parse kaf files
5
+ Home-page: UNKNOWN
6
+ Author: Ruben Izquierdo
7
+ Author-email: r.izquierdobevia@vu.nl
8
+ License: UNKNOWN
9
+ Description: UNKNOWN
10
+ Platform: UNKNOWN
@@ -0,0 +1,7 @@
1
+ VUKafParserPy/KafDataObjectsMod.py
2
+ VUKafParserPy/KafParserMod.py
3
+ VUKafParserPy/__init__.py
4
+ VUKafParserPy.egg-info/PKG-INFO
5
+ VUKafParserPy.egg-info/SOURCES.txt
6
+ VUKafParserPy.egg-info/dependency_links.txt
7
+ VUKafParserPy.egg-info/top_level.txt
@@ -0,0 +1,11 @@
1
+ ../VUKafParserPy/KafParserMod.py
2
+ ../VUKafParserPy/__init__.py
3
+ ../VUKafParserPy/KafDataObjectsMod.py
4
+ ../VUKafParserPy/KafParserMod.pyc
5
+ ../VUKafParserPy/__init__.pyc
6
+ ../VUKafParserPy/KafDataObjectsMod.pyc
7
+ ./
8
+ top_level.txt
9
+ SOURCES.txt
10
+ PKG-INFO
11
+ dependency_links.txt
@@ -0,0 +1,165 @@
1
+ class KafTermSentiment:
2
+ def __init__(self):
3
+ self.resource=None
4
+ self.polarity=None
5
+ self.strength=None
6
+ self.subjectivity=None
7
+
8
+ def simpleInit(self,r,p,st,su,sm=None):
9
+ self.resource=r
10
+ self.polarity=p
11
+ self.strength=st
12
+ self.subjectivity=su
13
+ self.sentiment_modifier = sm
14
+
15
+ def getPolarity(self):
16
+ return self.polarity
17
+
18
+ def getSentimentModifier(self):
19
+ return self.sentiment_modifier
20
+
21
+
22
+ class KafToken:
23
+ def __init__(self,wid, value, sent=None, para=None):
24
+ self.token_id = wid
25
+ self.value = value
26
+ self.sent = sent
27
+ self.para = para
28
+
29
+
30
+ class KafOpinionExpression:
31
+ def __init__(self,polarity,strength,targets):
32
+ self.polarity = polarity
33
+ self.strength = strength
34
+ self.targets = targets
35
+
36
+ def __str__(self):
37
+ return 'Op_exp==> pol:'+self.polarity+' Str:'+self.strength+' ids:'+'-'.join(self.targets)
38
+
39
+ class KafOpinion:
40
+ def __init__(self,id,holders, targets, opi_exp):
41
+ self.id = id
42
+ self.holders = holders
43
+ self.targets = targets
44
+ self.opi_exp = opi_exp
45
+
46
+ def __str__(self):
47
+ c='Opinion id'+self.id+'\n'
48
+ c+=' Holders: '+'-'.join(self.holders)+'\n'
49
+ c+=' Targets: '+'-'.join(self.targets)+'\n'
50
+ c+=str(self.opi_exp)
51
+ return c
52
+
53
+
54
+
55
+ class KafSingleProperty:
56
+ def __init__(self,id,type,targets):
57
+ self.id = id
58
+ self.type = type
59
+ self.targets = targets
60
+
61
+
62
+ def get_id(self):
63
+ return self.id
64
+
65
+ def get_type(self):
66
+ return self.type
67
+
68
+ def get_span(self):
69
+ return self.targets
70
+
71
+ def __str__(self):
72
+ return 'Id: '+self.id+' Type: '+self.type+' ids:'+' '.join(self.targets)
73
+
74
+
75
+ class KafSingleEntity:
76
+ def __init__(self,id,type,targets):
77
+ self.id = id
78
+ self.type = type
79
+ self.targets = targets
80
+
81
+ def get_id(self):
82
+ return self.id
83
+
84
+ def get_type(self):
85
+ return self.type
86
+
87
+ def get_span(self):
88
+ return self.targets
89
+
90
+ def __str__(self):
91
+ return 'Id: '+self.id+' Type: '+self.type+' ids:'+' '.join(self.targets)
92
+
93
+ class KafTerm:
94
+ def __init__(self):
95
+ self.tid = None
96
+ self.lemma = None
97
+ self.pos = None
98
+ self.morphofeat = None
99
+ self.sentiment = None
100
+ self.list_span_id = []
101
+
102
+ def get_morphofeat(self):
103
+ return self.morphofeat
104
+
105
+ def set_list_span_id(self, L):
106
+ self.list_span_id = L
107
+
108
+ def get_list_span(self):
109
+ return self.list_span_id
110
+
111
+ def get_polarity(self):
112
+ if self.sentiment != None:
113
+ return self.sentiment.getPolarity()
114
+ else:
115
+ return None
116
+
117
+ def get_sentiment_modifier(self):
118
+ if self.sentiment != None:
119
+ return self.sentiment.getSentimentModifier()
120
+ else:
121
+ return None
122
+
123
+
124
+ def setSentiment(self,my_sent):
125
+ self.sentiment = my_sent
126
+
127
+ def getSentiment(self):
128
+ return self.sentiment
129
+
130
+ def getLemma(self):
131
+ return self.lemma
132
+
133
+ def setLemma(self,lemma):
134
+ self.lemma = lemma
135
+
136
+ def getPos(self):
137
+ return self.pos
138
+
139
+ def setPos(self,pos):
140
+ self.pos = pos
141
+
142
+ def getId(self):
143
+ return self.tid
144
+
145
+ def setId(self,id):
146
+ self.tid = id
147
+
148
+ def getShortPos(self):
149
+ if self.pos==None:
150
+ return None
151
+ auxpos=self.pos.lower()[0]
152
+ if auxpos == 'g': auxpos='a'
153
+ elif auxpos == 'a': auxpos='r'
154
+ return auxpos
155
+
156
+ def __str__(self):
157
+ if self.tid and self.lemma and self.pos:
158
+ return self.tid+'\n\t'+self.lemma.encode('utf-8')+'\n\t'+self.pos
159
+ else:
160
+ return 'None'
161
+
162
+
163
+
164
+
165
+