opener-tree-tagger 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/README.md +150 -0
  3. data/bin/opener-tree-tagger-daemon +7 -0
  4. data/bin/opener-tree-tagger-server +11 -0
  5. data/bin/tree-tagger +7 -0
  6. data/config.ru +5 -0
  7. data/core/dutch.map.treetagger.kaf.csv +40 -0
  8. data/core/english.map.treetagger.kaf.csv +36 -0
  9. data/core/french.map.treetagger.kaf.csv +33 -0
  10. data/core/german.map.treetagger.kaf.csv +52 -0
  11. data/core/italian.map.treetagger.kaf.csv +38 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  14. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  15. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  16. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  20. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  21. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  23. data/core/spanish.map.treetagger.kaf.csv +75 -0
  24. data/core/token_matcher.py +82 -0
  25. data/core/tt_from_kaf_to_kaf.py +215 -0
  26. data/exec/tree-tagger.rb +9 -0
  27. data/ext/hack/Rakefile +13 -0
  28. data/ext/hack/support.rb +38 -0
  29. data/lib/opener/tree_tagger.rb +69 -0
  30. data/lib/opener/tree_tagger/cli.rb +69 -0
  31. data/lib/opener/tree_tagger/public/markdown.css +284 -0
  32. data/lib/opener/tree_tagger/server.rb +16 -0
  33. data/lib/opener/tree_tagger/version.rb +5 -0
  34. data/lib/opener/tree_tagger/views/index.erb +96 -0
  35. data/lib/opener/tree_tagger/views/result.erb +15 -0
  36. data/opener-tree-tagger.gemspec +35 -0
  37. data/pre_build_requirements.txt +1 -0
  38. metadata +197 -0
data/core/spanish.map.treetagger.kaf.csv
@@ -0,0 +1,75 @@
+ ACRNM O acronym (ISO, CEI)
+ ADJ G Adjectives (mayores, mayor)
+ ADV A Adverbs (muy, demasiado, cómo)
+ ALFP O Plural letter of the alphabet (As/Aes, bes)
+ ALFS O Singular letter of the alphabet (A, b)
+ ART D Articles (un, las, la, unas)
+ BACKSLASH O backslash (\)
+ CARD O Cardinals
+ CC C Coordinating conjunction (y, o)
+ CCAD C Adversative coordinating conjunction (pero)
+ CCNEG C Negative coordinating conjunction (ni)
+ CM O comma (,)
+ CODE O Alphanumeric code
+ COLON O colon (:)
+ CQUE C que (as conjunction)
+ CSUBF C Subordinating conjunction that introduces finite clauses (apenas)
+ CSUBI C Subordinating conjunction that introduces infinite clauses (al)
+ CSUBX C Subordinating conjunction underspecified for subord-type (aunque)
+ DASH O dash (-)
+ DM Q Demonstrative pronouns (ésas, ése, esta)
+ DOTS O POS tag for "..."
+ FO O Formula
+ FS O Full stop punctuation marks
+ INT Q Interrogative pronouns (quiénes, cuántas, cuánto)
+ ITJN O Interjection (oh, ja)
+ LP O left parenthesis ("(", "[")
+ NC N Common nouns (mesas, mesa, libro, ordenador)
+ NEG O Negation
+ NMEA N measure noun (metros, litros)
+ NMON N month name
+ NP R Proper nouns
+ ORD O Ordinals (primer, primeras, primera)
+ PAL O Portmanteau word formed by a and el
+ PDEL O Portmanteau word formed by de and el
+ PE O Foreign word
+ PERCT O percent sign (%)
+ PNC O Unclassified word
+ PPC Q Clitic personal pronoun (le, les)
+ PPO Q Possessive pronouns (mi, su, sus)
+ PPX Q Clitics and personal pronouns (nos, me, nosotras, te, sí)
+ PREP O Negative preposition (sin)
+ PREP O Preposition
+ PREP/DEL O Complex preposition "después del"
+ QT O quotation symbol (" ' `)
+ QU O Quantifiers (sendas, cada)
+ REL Q Relative pronouns (cuyas, cuyo)
+ RP O right parenthesis (")", "]")
+ SE O Se (as particle)
+ SEMICOLON O semicolon (;)
+ SLASH O slash (/)
+ SYM O Symbols
+ UMMX N measure unit (MHz, km, mA)
+ VCLIger V clitic gerund verb
+ VCLIinf V clitic infinitive verb
+ VCLIfin V clitic finite verb
+ VEadj V Verb estar. Past participle
+ VEfin V Verb estar. Finite
+ VEger V Verb estar. Gerund
+ VEinf V Verb estar. Infinitive
+ VHadj V Verb haber. Past participle
+ VHfin V Verb haber. Finite
+ VHger V Verb haber. Gerund
+ VHinf V Verb haber. Infinitive
+ VLadj V Lexical verb. Past participle
+ VLfin V Lexical verb. Finite
+ VLger V Lexical verb. Gerund
+ VLinf V Lexical verb. Infinitive
+ VMadj V Modal verb. Past participle
+ VMfin V Modal verb. Finite
+ VMger V Modal verb. Gerund
+ VMinf V Modal verb. Infinitive
+ VSadj V Verb ser. Past participle
+ VSfin V Verb ser. Finite
+ VSger V Verb ser. Gerund
+ VSinf V Verb ser. Infinitive
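
Each row above pairs a TreeTagger tag (first column) with a one-letter KAF part-of-speech (second column); the parenthesised examples are documentation only and are ignored when the file is read. A minimal sketch of how such a file becomes a lookup table, mirroring the loadMapping function in tt_from_kaf_to_kaf.py further down (the file path here is illustrative):

    # Python 2, matching the kernel below: whitespace-split each row and
    # keep only the first two columns as a tag -> KAF-pos lookup table.
    def load_mapping(path):
        table = {}
        fic = open(path)
        for line in fic:
            fields = line.strip().split()
            table[fields[0]] = fields[1]   # e.g. 'VLfin' -> 'V', 'NC' -> 'N'
        fic.close()
        return table

    # table = load_mapping('spanish.map.treetagger.kaf.csv')
    # table.get('NC', 'O')   # -> 'N'; unknown tags fall back to 'O'
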
data/core/token_matcher.py
@@ -0,0 +1,82 @@
+ #!/usr/bin/env python
+
+
+ #####
+ # 4-Mar-2013 : modified order of rules to check first if there is a merge and then if it is an extra token
+ # because of this case, where both can occur: [ .. . ] [ . . . ]
+
+
+ def add_match(d,id_new,id_ref):
+     if id_new in d:
+         d[id_new].append(id_ref)
+     else:
+         d[id_new]=[id_ref]
+
+
+ def token_matcher(l_ref,l_new,super_d):
+     debug = False
+     if debug:
+         print l_ref
+         print l_new
+     if len(l_new)==0:
+         return
+     else:
+         token_ref, id_ref = l_ref[0]
+         token_new, id_new = l_new[0]
+         if token_ref == token_new:
+             if debug: print 'Matching ',l_ref[0],l_new[0]
+             if debug: print 'A',l_ref[0],l_new[0]
+             add_match(super_d,id_new,id_ref)
+             token_matcher(l_ref[1:],l_new[1:],super_d)
+         else:
+             if token_ref.startswith(token_new): ## There was a split
+                 if debug: print 'D'
+                 aux = (token_ref[len(token_new):],id_ref)
+                 l_ref[0]=aux
+
+                 add_match(super_d,id_new,id_ref)
+                 token_matcher(l_ref,l_new[1:],super_d)
+
+             elif token_new.startswith(token_ref): ## There was a merge
+                 if debug: print 'E'
+                 aux = (token_new[len(token_ref)+1:],id_new)
+                 l_new[0]=aux
+                 add_match(super_d,id_new,id_ref)
+                 token_matcher(l_ref[1:],l_new,super_d)
+
+
+             elif len(l_new)>1 and l_new[1][0]==token_ref: ## There is an extra token in l_new
+                 if debug: print 'B',l_new[1][0],token_ref
+                 token_matcher(l_ref[0:],l_new[1:],super_d)
+
+
+             elif len(l_ref)>1 and l_ref[1][0] == token_new: ## There is an extra token in l_ref
+                 if debug: print 'C',l_ref[1:],l_new[0:]
+                 token_matcher(l_ref[1:],l_new[0:],super_d)
+
+
+             else: ## Impossible matching
+                 if debug: print 'F'
+                 if debug: print 'Impossible match of ',l_new[0],l_ref[0]
+                 token_matcher(l_ref[1:],l_new[1:],super_d)
+
+
+ if __name__ == '__main__':
+     l1 = []
+     s1 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau (Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
+
+     s1 = 'Th is is a very simple example'
+     for n,t in enumerate(s1.split(' ')):
+         l1.append((t,'id'+str(n)))
+
+     l2 = []
+     #s2 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau ( Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
+     s2 = 'This is a very sim ple example'
+     for n,t in enumerate(s2.split(' ')):
+         l2.append((t,'id'+str(n)))
+
+     super_d = {}
+     token_matcher(l1,l2,super_d)
+     print l1
+     print l2
+     print super_d
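
token_matcher aligns two tokenisations of the same sentence: the reference tokens from the input KAF and the tokens TreeTagger produced. It records in super_d which reference token ids each new token id covers, consuming the matched prefix on a split or merge and skipping unmatched extra tokens. A small verifiable case (identifiers are illustrative; note that both lists are modified in place):

    # One reference token that the tagger-side tokenizer split in two:
    ref = [('NewYork', 'w1')]
    new = [('New', 't_0'), ('York', 't_1')]
    super_d = {}
    token_matcher(ref, new, super_d)
    # Both new token ids map back to the same reference token:
    # super_d == {'t_0': ['w1'], 't_1': ['w1']}
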
data/core/tt_from_kaf_to_kaf.py
@@ -0,0 +1,215 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ __version__ = '1.2 4-Mar-2013'
+
+ ## Last changes
+ # 1-Mar-2013 --> now it works with UTF-8 !!!
+ # 4-Mar-2013 --> added code for including the element in the linguistic processors header
+ # 5-Mar-2013 --> language is no longer a parameter; it is read from the input KAF
+ # 9-dec-2013 --> the POS tagger avoids 2 terms with the same token id span, like 's --> ' and s
+ # 11-mar-2014 --> fixed problem when merging with token_matcher
+ ###################################
+
+
+ import sys
+ import os
+
+ this_folder = os.path.dirname(os.path.realpath(__file__))
+
+ # This updates the load path to ensure that the local site-packages directory
+ # can be used to load packages (e.g. a locally installed copy of lxml).
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
+
+ import operator
+ import time
+ import getopt
+ import string
+ import subprocess
+ import lxml
+ from lxml import etree
+ from lxml.etree import ElementTree as ET, Element as EL, PI
+ from VUKafParserPy.KafParserMod import KafParser
+ from token_matcher import token_matcher
+
+
+
+ if not os.environ.get('TREE_TAGGER_PATH'):
+     print>>sys.stderr,"TREE_TAGGER_PATH environment variable not found. Please set the full path to your TreeTagger installation in the TREE_TAGGER_PATH environment variable."
+     sys.exit(-1)
+
+ complete_path_to_treetagger = os.environ.get('TREE_TAGGER_PATH')
+
+
+ def loadMapping(mapping_file):
+     map={}
+     filename = os.path.join(os.path.dirname(__file__),mapping_file)
+     fic = open(filename)
+     for line in fic:
+         fields = line.strip().split()
+         map[fields[0]] = fields[1]
+     fic.close()
+     return map
+
+
+
+ if __name__=='__main__':
+     this_folder = os.path.dirname(os.path.realpath(__file__))
+
+     if sys.stdin.isatty():
+         print>>sys.stderr,'Input stream required.'
+         print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
+         sys.exit(-1)
+
+     time_stamp = True
+     try:
+         opts, args = getopt.getopt(sys.argv[1:],"l:",["no-time"])
+         for opt, arg in opts:
+             if opt == "--no-time":
+                 time_stamp = False
+     except getopt.GetoptError:
+         pass
+
+
+     input_kaf = KafParser(sys.stdin)
+     my_lang = input_kaf.getLanguage()
+
+
+     if my_lang == 'en':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english-utf8'
+         mapping_file = this_folder +'/english.map.treetagger.kaf.csv'
+         model = 'English models'
+     elif my_lang == 'nl':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
+         mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
+         model = 'Dutch models'
+     elif my_lang == 'de':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-german-utf8'
+         mapping_file = this_folder +'/german.map.treetagger.kaf.csv'
+         model = 'German models'
+     elif my_lang == 'fr':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-french-utf8'
+         mapping_file = this_folder +'/french.map.treetagger.kaf.csv'
+         model = 'French models'
+     elif my_lang == 'it':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-italian-utf8'
+         mapping_file = this_folder +'/italian.map.treetagger.kaf.csv'
+         model = 'Italian models'
+     elif my_lang == 'es':
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-spanish-utf8'
+         mapping_file = this_folder +'/spanish.map.treetagger.kaf.csv'
+         model = 'Spanish models'
+     else: ## Default is Dutch
+         treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
+         mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
+         model = 'Dutch models'
+
+     map_tt_to_kaf = loadMapping(mapping_file)
+
+
+     ## Create the input text for the tagger, grouping the tokens per sentence
+     reference_tokens = []
+     sentences = []
+     prev_sent='-200'
+     aux = []
+     for word, sent_id, w_id in input_kaf.getTokens():
+         if sent_id != prev_sent:
+             if len(aux) != 0:
+                 sentences.append(aux)
+                 aux = []
+         aux.append((word,w_id))
+
+         prev_sent = sent_id
+     if len(aux)!=0:
+         sentences.append(aux)
+
+
+     for sentence in sentences:
+         #print>>sys.stderr,'Input sentence:',sentence
+         text = ' '.join(t.encode('utf-8') for t,_ in sentence)
+
+         if not os.path.isfile(treetagger_cmd):
+             print>>sys.stderr, "Can't find the proper TreeTagger command: " +treetagger_cmd
+             raise IOError(treetagger_cmd)
+         try:
+             tt_proc = subprocess.Popen(treetagger_cmd,stdin=subprocess.PIPE, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+         except Exception as e:
+             print>>sys.stderr,str(e)
+
+         out, err = tt_proc.communicate(text)
+
+         #print>>sys.stderr,'Output treetagger',out
+         data = {}
+         new_tokens = []
+         for n,line in enumerate(out.splitlines()):
+             line = line.decode('utf-8')
+             my_id='t_'+str(n)
+             token,pos,lemma = line.strip().split('\t')
+             pos_kaf = map_tt_to_kaf.get(pos,'O')
+
+             if lemma=='<unknown>':
+                 lemma=token
+                 pos+=' unknown_lemma'
+             if pos_kaf in ['N','R','G','V','A','O']:
+                 type_term = 'open'
+             else:
+                 type_term = 'close'
+             data[my_id] = (token,pos_kaf,lemma,type_term,pos)
+             new_tokens.append((token,my_id))
+         #tt_proc.terminate()
+
+         mapping_tokens = {}
+         #print
+         #print 'SENTENCE',sentence
+         #print 'New tokens',new_tokens
+         token_matcher(sentence,new_tokens,mapping_tokens)
+         #print mapping_tokens
+         #print
+         new_terms = []
+         terms_for_token = {}
+         for token_new, id_new in new_tokens:
+             token,pos_kaf,lemma,type_term,pos = data[id_new]
+             ref_tokens = mapping_tokens[id_new]
+             span = []
+             #print token_new, id_new, ref_tokens
+             for ref_token in ref_tokens:
+                 span.append(ref_token)
+                 if ref_token in terms_for_token:
+                     terms_for_token[ref_token].append(id_new)
+                 else:
+                     terms_for_token[ref_token] = [id_new]
+
+             new_terms.append((id_new,type_term,pos_kaf,pos,lemma,span))
+
+
+         #print terms_for_token
+         not_use = set()
+         for id_new,type_term,pos_kaf,pos,lemma,span in new_terms:
+             #print not_use
+             #print id_new
+             if id_new not in not_use:
+                 new_lemma = ''
+                 for tokenid in span:
+                     if len(terms_for_token[tokenid]) > 1:
+                         new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
+                         not_use |= set(terms_for_token[tokenid])
+                 if new_lemma != '':
+                     lemma = new_lemma
+
+                 ###############
+                 ele_term = EL('term',attrib={'tid':id_new,
+                                              'type':type_term,
+                                              'pos':pos_kaf,
+                                              'morphofeat':pos,
+                                              'lemma':lemma})
+                 ele_span = EL('span')
+                 for ref_token in span:
+                     eleTarget = EL('target',attrib={'id':ref_token})
+                     ele_span.append(eleTarget)
+                 ele_term.append(ele_span)
+                 input_kaf.addElementToLayer('terms', ele_term)
+     ##End for each sentence
+
+     input_kaf.addLinguisticProcessor('TreeTagger_from_kaf '+model,'1.0', 'term', time_stamp)
+     input_kaf.saveToFile(sys.stdout)
+
+
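
For every term the loop above builds a <term> element whose span points back at the reference token ids, then appends it to the KAF terms layer. A sketch of the element shape, using the same lxml calls as the kernel but with illustrative values:

    from lxml.etree import Element as EL, tostring

    ele_term = EL('term', attrib={'tid': 't_0', 'type': 'open', 'pos': 'N',
                                  'morphofeat': 'NC', 'lemma': 'mesa'})
    ele_span = EL('span')
    ele_span.append(EL('target', attrib={'id': 'w1'}))
    ele_term.append(ele_span)
    print tostring(ele_term, pretty_print=True)
    # Prints something like (attribute order may vary):
    # <term tid="t_0" type="open" pos="N" morphofeat="NC" lemma="mesa">
    #   <span>
    #     <target id="w1"/>
    #   </span>
    # </term>
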
data/exec/tree-tagger.rb
@@ -0,0 +1,9 @@
+ #!/usr/bin/env ruby
+
+ require 'opener/daemons'
+ require_relative '../lib/opener/tree_tagger'
+
+ options = Opener::Daemons::OptParser.parse!(ARGV)
+ daemon = Opener::Daemons::Daemon.new(Opener::TreeTagger, options)
+
+ daemon.start
data/ext/hack/Rakefile
@@ -0,0 +1,13 @@
+ require 'rake'
+ require_relative 'support'
+
+ desc 'Verifies the requirements'
+ task :requirements do
+   verify_requirements
+ end
+
+ task :default => :requirements do
+   # path = File.join(PYTHON_SITE_PACKAGES, 'pre_install')
+   #
+   # pip_install(PRE_INSTALL_REQUIREMENTS, path)
+ end
data/ext/hack/support.rb
@@ -0,0 +1,38 @@
+ require 'opener/build-tools'
+
+ include Opener::BuildTools::Requirements
+ include Opener::BuildTools::Python
+ include Opener::BuildTools::Files
+
+ # Directory where packages will be installed to.
+ PYTHON_SITE_PACKAGES = File.expand_path(
+   '../../../core/site-packages',
+   __FILE__
+ )
+
+ # Directory containing the temporary files.
+ TMP_DIRECTORY = File.expand_path('../../../tmp', __FILE__)
+
+ # Path to the pip requirements file used to install requirements before
+ # packaging the Gem.
+ PRE_BUILD_REQUIREMENTS = File.expand_path(
+   '../../../pre_build_requirements.txt',
+   __FILE__
+ )
+
+ # Path to the pip requirements file used to install requirements upon Gem
+ # installation.
+ PRE_INSTALL_REQUIREMENTS = File.expand_path(
+   '../../../pre_install_requirements.txt',
+   __FILE__
+ )
+
+ ##
+ # Verifies the requirements to install this Gem.
+ #
+ def verify_requirements
+   require_executable('python')
+   require_version('python', python_version, '2.6.0')
+   require_executable('pip')
+   require_version('pip', pip_version, '1.3.1')
+ end
data/lib/opener/tree_tagger.rb
@@ -0,0 +1,69 @@
+ require 'open3'
+ require 'optparse'
+
+ require_relative 'tree_tagger/version'
+ require_relative 'tree_tagger/cli'
+
+ module Opener
+   class TreeTagger
+     attr_reader :options, :args
+
+     ##
+     # Hash containing the default options to use.
+     #
+     # @return [Hash]
+     #
+     DEFAULT_OPTIONS = {
+       :args => []
+     }.freeze
+
+     ##
+     # @param [Hash] options
+     #
+     # @option options [Array] :args Collection of arbitrary arguments to pass
+     #  to the underlying kernel.
+     #
+     def initialize(options = {})
+       @args    = options.delete(:args) || []
+       @options = DEFAULT_OPTIONS.merge(options)
+     end
+
+     def run(input)
+       stdout, stderr, process = capture(input)
+
+       if process.success?
+         STDERR.puts(stderr) unless stderr.empty?
+       else
+         abort stderr
+       end
+
+       return stdout, stderr, process
+     end
+
+     def capture(input)
+       Open3.capture3(*command.split(" "), :stdin_data => input)
+     end
+
+     def command
+       return "#{adjust_python_path} python -E -OO #{kernel} #{args.join(' ')}"
+     end
+
+     protected
+
+     ##
+     # @return [String]
+     #
+     def adjust_python_path
+       site_packages = File.join(core_dir, 'site-packages')
+       "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
+     end
+
+     def core_dir
+       File.expand_path("../../core", File.dirname(__FILE__))
+     end
+
+     def kernel
+       File.join(core_dir, 'tt_from_kaf_to_kaf.py')
+     end
+   end
+ end