opener-tree-tagger 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +150 -0
  3. data/bin/opener-tree-tagger-daemon +7 -0
  4. data/bin/opener-tree-tagger-server +11 -0
  5. data/bin/tree-tagger +7 -0
  6. data/config.ru +5 -0
  7. data/core/dutch.map.treetagger.kaf.csv +40 -0
  8. data/core/english.map.treetagger.kaf.csv +36 -0
  9. data/core/french.map.treetagger.kaf.csv +33 -0
  10. data/core/german.map.treetagger.kaf.csv +52 -0
  11. data/core/italian.map.treetagger.kaf.csv +38 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  14. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  15. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  16. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  20. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  21. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  23. data/core/spanish.map.treetagger.kaf.csv +75 -0
  24. data/core/token_matcher.py +82 -0
  25. data/core/tt_from_kaf_to_kaf.py +215 -0
  26. data/exec/tree-tagger.rb +9 -0
  27. data/ext/hack/Rakefile +13 -0
  28. data/ext/hack/support.rb +38 -0
  29. data/lib/opener/tree_tagger.rb +69 -0
  30. data/lib/opener/tree_tagger/cli.rb +69 -0
  31. data/lib/opener/tree_tagger/public/markdown.css +284 -0
  32. data/lib/opener/tree_tagger/server.rb +16 -0
  33. data/lib/opener/tree_tagger/version.rb +5 -0
  34. data/lib/opener/tree_tagger/views/index.erb +96 -0
  35. data/lib/opener/tree_tagger/views/result.erb +15 -0
  36. data/opener-tree-tagger.gemspec +35 -0
  37. data/pre_build_requirements.txt +1 -0
  38. metadata +197 -0
@@ -0,0 +1,75 @@
1
+ ACRNM O acronym (ISO, CEI)
2
+ ADJ G Adjectives (mayores, mayor)
3
+ ADV A Adverbs (muy, demasiado, cómo)
4
+ ALFP O Plural letter of the alphabet (As/Aes, bes)
5
+ ALFS O Singular letter of the alphabet (A, b)
6
+ ART D Articles (un, las, la, unas)
7
+ BACKSLASH O backslash (\)
8
+ CARD O Cardinals
9
+ CC C Coordinating conjunction (y, o)
10
+ CCAD C Adversative coordinating conjunction (pero)
11
+ CCNEG C Negative coordinating conjunction (ni)
12
+ CM O comma (,)
13
+ CODE O Alphanumeric code
14
+ COLON O colon (:)
15
+ CQUE C que (as conjunction)
16
+ CSUBF C Subordinating conjunction that introduces finite clauses (apenas)
17
+ CSUBI C Subordinating conjunction that introduces infinite clauses (al)
18
+ CSUBX C Subordinating conjunction underspecified for subord-type (aunque)
19
+ DASH O dash (-)
20
+ DM Q Demonstrative pronouns (ésas, ése, esta)
21
+ DOTS O POS tag for "..."
22
+ FO O Formula
23
+ FS O Full stop punctuation marks
24
+ INT Q Interrogative pronouns (quiénes, cuántas, cuánto)
25
+ ITJN O Interjection (oh, ja)
26
+ LP O left parenthesis ("(", "[")
27
+ NC N Common nouns (mesas, mesa, libro, ordenador)
28
+ NEG O Negation
29
+ NMEA N measure noun (metros, litros)
30
+ NMON N month name
31
+ NP R Proper nouns
32
+ ORD O Ordinals (primer, primeras, primera)
33
+ PAL O Portmanteau word formed by a and el
34
+ PDEL O Portmanteau word formed by de and el
35
+ PE O Foreign word
36
+ PERCT O percent sign (%)
37
+ PNC O Unclassified word
38
+ PPC Q Clitic personal pronoun (le, les)
39
+ PPO Q Possessive pronouns (mi, su, sus)
40
+ PPX Q Clitics and personal pronouns (nos, me, nosotras, te, sí)
41
+ PREP O Negative preposition (sin)
42
+ PREP O Preposition
43
+ PREP/DEL O Complex preposition "después del"
44
+ QT O quotation symbol (" ' `)
45
+ QU O Quantifiers (sendas, cada)
46
+ REL Q Relative pronouns (cuyas, cuyo)
47
+ RP O right parenthesis (")", "]")
48
+ SE O Se (as particle)
49
+ SEMICOLON O semicolon (;)
50
+ SLASH O slash (/)
51
+ SYM O Symbols
52
+ UMMX N measure unit (MHz, km, mA)
53
+ VCLIger V clitic gerund verb
54
+ VCLIinf V clitic infinitive verb
55
+ VCLIfin V clitic finite verb
56
+ VEadj V Verb estar. Past participle
57
+ VEfin V Verb estar. Finite
58
+ VEger V Verb estar. Gerund
59
+ VEinf V Verb estar. Infinitive
60
+ VHadj V Verb haber. Past participle
61
+ VHfin V Verb haber. Finite
62
+ VHger V Verb haber. Gerund
63
+ VHinf V Verb haber. Infinitive
64
+ VLadj V Lexical verb. Past participle
65
+ VLfin V Lexical verb. Finite
66
+ VLger V Lexical verb. Gerund
67
+ VLinf V Lexical verb. Infinitive
68
+ VMadj V Modal verb. Past participle
69
+ VMfin V Modal verb. Finite
70
+ VMger V Modal verb. Gerund
71
+ VMinf V Modal verb. Infinitive
72
+ VSadj V Verb ser. Past participle
73
+ VSfin V Verb ser. Finite
74
+ VSger V Verb ser. Gerund
75
+ VSinf V Verb ser. Infinitive
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python
2
+
3
+
4
+ #####
5
+ # 4-Mar-2013 : modified order of rules to check first if there is a merge and then if it is an extra token
6
+ # because of this case, where both can occur: [ .. . ] [ . . . ]
7
+
8
+
9
+ def add_match(d,id_new,id_ref):
10
+ if id_new in d:
11
+ d[id_new].append(id_ref)
12
+ else:
13
+ d[id_new]=[id_ref]
14
+
15
+
16
+ def token_matcher(l_ref,l_new,super_d):
17
+ debug = False
18
+ if debug:
19
+ print l_ref
20
+ print l_new
21
+ if len(l_new)==0:
22
+ return
23
+ else:
24
+ token_ref, id_ref = l_ref[0]
25
+ token_new, id_new = l_new[0]
26
+ if token_ref == token_new:
27
+ if debug: print 'Matching ',l_ref[0],l_new[0]
28
+ if debug: print 'A',l_ref[0],l_new[0]
29
+ add_match(super_d,id_new,id_ref)
30
+ token_matcher(l_ref[1:],l_new[1:],super_d)
31
+ else:
32
+ if token_ref.startswith(token_new) : ##There was an split
33
+ if debug: print 'D'
34
+ aux = (token_ref[len(token_new):],id_ref)
35
+ l_ref[0]=aux
36
+
37
+ add_match(super_d,id_new,id_ref)
38
+ token_matcher(l_ref,l_new[1:],super_d)
39
+
40
+ elif token_new.startswith(token_ref) : ##There was a merge
41
+ if debug: print 'E'
42
+ aux = (token_new[len(token_ref)+1:],id_new)
43
+ l_new[0]=aux
44
+ add_match(super_d,id_new,id_ref)
45
+ token_matcher(l_ref[1:],l_new,super_d)
46
+
47
+
48
+ elif len(l_new)>1 and l_new[1][0]==token_ref: ## There is an extra token in l_new
49
+ if debug: print 'B',l_new[1][0],token_ref
50
+ token_matcher(l_ref[0:],l_new[1:],super_d)
51
+
52
+
53
+ elif len(l_ref)>1 and l_ref[1][0] == token_new: ## There is an extra token in l_ref
54
+ if debug: print 'C',l_ref[1:],l_new[0:]
55
+ token_matcher(l_ref[1:],l_new[0:],super_d)
56
+
57
+
58
+ else: ## Imposible matching
59
+ if debug: print 'F'
60
+ if debug: print 'Impossible match of ',l_new[0],l_ref[0]
61
+ token_matcher(l_ref[1:],l_new[1:],super_d)
62
+
63
+
64
+ if __name__ == '__main__':
65
+ l1 = []
66
+ s1 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau (Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
67
+
68
+ s1 = 'Th is is a very simple example'
69
+ for n,t in enumerate(s1.split(' ')):
70
+ l1.append((t,'id'+str(n)))
71
+
72
+ l2 = []
73
+ #s2 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau ( Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
74
+ s2 = 'This is a very sim ple example'
75
+ for n,t in enumerate(s2.split(' ')):
76
+ l2.append((t,'id'+str(n)))
77
+
78
+ super_d = {}
79
+ token_matcher(l1,l2,super_d)
80
+ print l1
81
+ print l2
82
+ print super_d
@@ -0,0 +1,215 @@
1
+ #!/usr/bin/env python
2
+ #-*- coding: utf8 *-*
3
+ __version__ = '1.2 4-Mar-2013'
4
+
5
+ ## Last changes
6
+ # 1-Mar-2013 --> now it works with UTF-8 !!!
7
+ # 4-Mar-2013 --> added code for including the element in the linguistic processors header
8
+ # 5-Mar-2013 --> language is not a parameter, is read from the input KAF
9
+ # 9-dec-2013 --> the postagger avoids 2 terms with the same tokenid span, like 's --> ' and s
10
+ # 11-mar-2014 --> fixed problem when merge with token_matcher
11
+ ###################################
12
+
13
+
14
+ import sys
15
+ import os
16
+
17
+ this_folder = os.path.dirname(os.path.realpath(__file__))
18
+
19
+ # This updates the load path to ensure that the local site-packages directory
20
+ # can be used to load packages (e.g. a locally installed copy of lxml).
21
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
22
+
23
+ import operator
24
+ import time
25
+ import getopt
26
+ import string
27
+ import subprocess
28
+ import lxml
29
+ from lxml import etree
30
+ from lxml.etree import ElementTree as ET, Element as EL, PI
31
+ from VUKafParserPy.KafParserMod import KafParser
32
+ from token_matcher import token_matcher
33
+
34
+
35
+
36
+ if not os.environ.get('TREE_TAGGER_PATH'):
37
+ print>>sys.stderr,"TREE_TAGGER_PATH environment variable not found. Please set the full path to your tree tagger in the TREE_TAGGER_PATH environent variable."
38
+ sys.exit(-1)
39
+
40
+ complete_path_to_treetagger = os.environ.get('TREE_TAGGER_PATH')
41
+
42
+
43
+ def loadMapping(mapping_file):
44
+ map={}
45
+ filename = os.path.join(os.path.dirname(__file__),mapping_file)
46
+ fic = open(filename)
47
+ for line in fic:
48
+ fields = line.strip().split()
49
+ map[fields[0]] = fields[1]
50
+ fic.close()
51
+ return map
52
+
53
+
54
+
55
+ if __name__=='__main__':
56
+ this_folder = os.path.dirname(os.path.realpath(__file__))
57
+
58
+ if sys.stdin.isatty():
59
+ print>>sys.stderr,'Input stream required.'
60
+ print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
61
+ sys.exit(-1)
62
+
63
+ time_stamp = True
64
+ try:
65
+ opts, args = getopt.getopt(sys.argv[1:],"l:",["no-time"])
66
+ for opt, arg in opts:
67
+ if opt == "--no-time":
68
+ time_stamp = False
69
+ except getopt.GetoptError:
70
+ pass
71
+
72
+
73
+ input_kaf = KafParser(sys.stdin)
74
+ my_lang = input_kaf.getLanguage()
75
+
76
+
77
+ if my_lang == 'en':
78
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english-utf8'
79
+ mapping_file = this_folder +'/english.map.treetagger.kaf.csv'
80
+ model = 'English models'
81
+ elif my_lang == 'nl':
82
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
83
+ mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
84
+ model = 'Dutch models'
85
+ elif my_lang == 'de':
86
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-german-utf8'
87
+ mapping_file = this_folder +'/german.map.treetagger.kaf.csv'
88
+ model = 'German models'
89
+ elif my_lang == 'fr':
90
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-french-utf8'
91
+ mapping_file = this_folder +'/french.map.treetagger.kaf.csv'
92
+ model = 'French models'
93
+ elif my_lang == 'it':
94
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-italian-utf8'
95
+ mapping_file = this_folder +'/italian.map.treetagger.kaf.csv'
96
+ model = 'Italian models'
97
+ elif my_lang == 'es':
98
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-spanish-utf8'
99
+ mapping_file = this_folder +'/spanish.map.treetagger.kaf.csv'
100
+ model = 'Spanish models'
101
+ else: ## Default is dutch
102
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
103
+ mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
104
+ model = 'Dutch models'
105
+
106
+ map_tt_to_kaf = loadMapping(mapping_file)
107
+
108
+
109
+ ## Create the input text for
110
+ reference_tokens = []
111
+ sentences = []
112
+ prev_sent='-200'
113
+ aux = []
114
+ for word, sent_id, w_id in input_kaf.getTokens():
115
+ if sent_id != prev_sent:
116
+ if len(aux) != 0:
117
+ sentences.append(aux)
118
+ aux = []
119
+ aux.append((word,w_id))
120
+
121
+ prev_sent = sent_id
122
+ if len(aux)!=0:
123
+ sentences.append(aux)
124
+
125
+
126
+ for sentence in sentences:
127
+ #print>>sys.stderr,'Input sentnece:',sentence
128
+ text = ' '.join(t.encode('utf-8') for t,_ in sentence)
129
+
130
+ if not os.path.isfile(treetagger_cmd):
131
+ print>>sys.stderr, "Can't find the proper tree tagger command: " +treetagger_cmd
132
+ raise IOError(treetagger_cmd)
133
+ try:
134
+ tt_proc = subprocess.Popen(treetagger_cmd,stdin=subprocess.PIPE, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
135
+ except Exception as e:
136
+ print>>sys.stderr,str(e)
137
+
138
+ out, err = tt_proc.communicate(text)
139
+
140
+ #print>>sys.stderr,'Output treetagger',out
141
+ data = {}
142
+ new_tokens = []
143
+ for n,line in enumerate(out.splitlines()):
144
+ line = line.decode('utf-8')
145
+ my_id='t_'+str(n)
146
+ token,pos,lemma = line.strip().split('\t')
147
+ pos_kaf = map_tt_to_kaf.get(pos,'O')
148
+
149
+ if lemma=='<unknown>':
150
+ lemma=token
151
+ pos+=' unknown_lemma'
152
+ if pos_kaf in ['N','R','G','V','A','O']:
153
+ type_term = 'open'
154
+ else:
155
+ type_term = 'close'
156
+ data[my_id] = (token,pos_kaf,lemma,type_term,pos)
157
+ new_tokens.append((token,my_id))
158
+ #tt_proc.terminate()
159
+
160
+ mapping_tokens = {}
161
+ #print
162
+ #print 'SENTENCE',sentence
163
+ #print 'New=tokens',new_tokens
164
+ token_matcher(sentence,new_tokens,mapping_tokens)
165
+ #print mapping_tokens
166
+ #print
167
+ new_terms = []
168
+ terms_for_token = {}
169
+ for token_new, id_new in new_tokens:
170
+ token,pos_kaf,lemma,type_term,pos = data[id_new]
171
+ ref_tokens = mapping_tokens[id_new]
172
+ span = []
173
+ #print token_new, id_new, ref_tokens
174
+ for ref_token in ref_tokens:
175
+ span.append(ref_token)
176
+ if ref_token in terms_for_token:
177
+ terms_for_token[ref_token].append(id_new)
178
+ else:
179
+ terms_for_token[ref_token] = [id_new]
180
+
181
+ new_terms.append((id_new,type_term,pos_kaf,pos,lemma,span))
182
+
183
+
184
+ #print terms_for_token
185
+ not_use = set()
186
+ for id_new,type_term,pos_kaf,pos,lemma,span in new_terms:
187
+ #print not_use
188
+ #print id_new
189
+ if id_new not in not_use:
190
+ new_lemma = ''
191
+ for tokenid in span:
192
+ if len(terms_for_token[tokenid]) > 1:
193
+ new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
194
+ not_use |= set(terms_for_token[tokenid])
195
+ if new_lemma != '':
196
+ lemma = new_lemma
197
+
198
+ ###############
199
+ ele_term = EL('term',attrib={'tid':id_new,
200
+ 'type':type_term,
201
+ 'pos':pos_kaf,
202
+ 'morphofeat':pos,
203
+ 'lemma':lemma})
204
+ ele_span = EL('span')
205
+ for ref_token in span:
206
+ eleTarget = EL('target',attrib={'id':ref_token})
207
+ ele_span.append(eleTarget)
208
+ ele_term.append(ele_span)
209
+ input_kaf.addElementToLayer('terms', ele_term)
210
+ ##End for each sentence
211
+
212
+ input_kaf.addLinguisticProcessor('TreeTagger_from_kaf '+model,'1.0', 'term', time_stamp)
213
+ input_kaf.saveToFile(sys.stdout)
214
+
215
+
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'opener/daemons'
4
+ require_relative '../lib/opener/tree_tagger'
5
+
6
+ options = Opener::Daemons::OptParser.parse!(ARGV)
7
+ daemon = Opener::Daemons::Daemon.new(Opener::TreeTagger, options)
8
+
9
+ daemon.start
@@ -0,0 +1,13 @@
1
+ require 'rake'
2
+ require_relative 'support'
3
+
4
+ desc 'Verifies the requirements'
5
+ task :requirements do
6
+ verify_requirements
7
+ end
8
+
9
+ task :default => :requirements do
10
+ # path = File.join(PYTHON_SITE_PACKAGES, 'pre_install')
11
+ #
12
+ # pip_install(PRE_INSTALL_REQUIREMENTS, path)
13
+ end
@@ -0,0 +1,38 @@
1
+ require 'opener/build-tools'
2
+
3
+ include Opener::BuildTools::Requirements
4
+ include Opener::BuildTools::Python
5
+ include Opener::BuildTools::Files
6
+
7
+ # Directory where packages will be installed to.
8
+ PYTHON_SITE_PACKAGES = File.expand_path(
9
+ '../../../core/site-packages',
10
+ __FILE__
11
+ )
12
+
13
+ # Directory containing the temporary files.
14
+ TMP_DIRECTORY = File.expand_path('../../../tmp', __FILE__)
15
+
16
+ # Path to the pip requirements file used to install requirements before
17
+ # packaging the Gem.
18
+ PRE_BUILD_REQUIREMENTS = File.expand_path(
19
+ '../../../pre_build_requirements.txt',
20
+ __FILE__
21
+ )
22
+
23
+ # Path to the pip requirements file used to install requirements upon Gem
24
+ # installation.
25
+ PRE_INSTALL_REQUIREMENTS = File.expand_path(
26
+ '../../../pre_install_requirements.txt',
27
+ __FILE__
28
+ )
29
+
30
+ ##
31
+ # Verifies the requirements to install thi Gem.
32
+ #
33
+ def verify_requirements
34
+ require_executable('python')
35
+ require_version('python', python_version, '2.6.0')
36
+ require_executable('pip')
37
+ require_version('pip', pip_version, '1.3.1')
38
+ end
@@ -0,0 +1,69 @@
1
+ require 'open3'
2
+ require 'optparse'
3
+
4
+ require_relative 'tree_tagger/version'
5
+ require_relative 'tree_tagger/cli'
6
+
7
+ module Opener
8
+ class TreeTagger
9
+ attr_reader :options, :args
10
+
11
+ ##
12
+ # Hash containing the default options to use.
13
+ #
14
+ # @return [Hash]
15
+ #
16
+ DEFAULT_OPTIONS = {
17
+ :args => []
18
+ }.freeze
19
+
20
+ ##
21
+ # @param [Hash] options
22
+ #
23
+ # @option options [Array] :args Collection of arbitrary arguments to pass
24
+ # to the underlying kernel.
25
+ #
26
+ def initialize(options = {})
27
+ @args = options.delete(:args) || []
28
+ @options = DEFAULT_OPTIONS.merge(options)
29
+ end
30
+
31
+ def run(input)
32
+ stdout, stderr, process = capture(input)
33
+
34
+ if process.success?
35
+ STDERR.puts(stderr) unless stderr.empty?
36
+ else
37
+ abort stderr
38
+ end
39
+
40
+ return stdout, stderr, process
41
+ end
42
+
43
+ def capture(input)
44
+ Open3.capture3(*command.split(" "), :stdin_data=>input)
45
+ end
46
+
47
+ def command
48
+ return "#{adjust_python_path} python -E -OO #{kernel} #{args.join(' ')}"
49
+ end
50
+
51
+ protected
52
+
53
+ ##
54
+ # @return [String]
55
+ #
56
+ def adjust_python_path
57
+ site_packages = File.join(core_dir, 'site-packages')
58
+ "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
59
+ end
60
+
61
+ def core_dir
62
+ File.expand_path("../../core", File.dirname(__FILE__))
63
+ end
64
+
65
+ def kernel
66
+ File.join(core_dir,'/tt_from_kaf_to_kaf.py')
67
+ end
68
+ end
69
+ end