opener-pos-tagger-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +110 -0
  3. data/bin/pos-tagger-base +21 -0
  4. data/core/mapping.postag.stss.to.opener.csv +52 -0
  5. data/core/mapping.postag.wotan.to.opener.csv +13 -0
  6. data/core/opennlp/bin/opennlp +35 -0
  7. data/core/opennlp/bin/opennlp.bat +35 -0
  8. data/core/opennlp/lib/jwnl-1.3.3.jar +0 -0
  9. data/core/opennlp/lib/opennlp-maxent-3.0.2-incubating.jar +0 -0
  10. data/core/opennlp/lib/opennlp-tools-1.5.2-incubating.jar +0 -0
  11. data/core/opennlp/lib/opennlp-uima-1.5.2-incubating.jar +0 -0
  12. data/core/opennlp/models/de-pos-maxent.bin +0 -0
  13. data/core/opennlp/models/de-pos-perceptron.bin +0 -0
  14. data/core/opennlp/models/nl-pos-maxent.bin +0 -0
  15. data/core/opennlp/models/nl-pos-perceptron.bin +0 -0
  16. data/core/pos-tagger_open-nlp.py +160 -0
  17. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  18. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  20. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  21. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  23. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  24. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  25. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  26. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  27. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  28. data/core/token_matcher.py +80 -0
  29. data/ext/hack/support.rb +38 -0
  30. data/lib/opener/pos_taggers/base.rb +90 -0
  31. data/lib/opener/pos_taggers/base/version.rb +7 -0
  32. data/opener-pos-tagger-base.gemspec +29 -0
  33. data/pre_build_requirements.txt +1 -0
  34. metadata +132 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e1d01b280c3f2369e20c811fa11a42150b41cc16
4
+ data.tar.gz: 7639fb3ce4fb64641047659339b500157940087c
5
+ SHA512:
6
+ metadata.gz: 31dd9808cc4b3ce95de10c8e95c456af963a2b93cc1bbef60e9917716b9de830ce3b749c28b63017ab7ce90393172cf499563400fbb62873a39eb0be2d0e2f1a
7
+ data.tar.gz: 7b9ab3549277fc1c93b09b60eae00f9b56894dee3565550f9eee7d82209316d770083df32085885282da18a3c12475b1e4c897f9c8fd928f2c144841bea88e3d
@@ -0,0 +1,110 @@
1
+ [![Build Status](https://drone.io/github.com/opener-project/pos-tagger-base/status.png)](https://drone.io/github.com/opener-project/pos-tagger-base/latest)
2
+
3
+ # Base POS Tagger
4
+
5
+ This repository contains the source code (both Ruby and Python) for the base
6
+ POS tagger. Currently this tagger supports the following languages:
7
+
8
+ * Dutch
9
+ * German
10
+
11
+ ## Requirements
12
+
13
+ * Python 2.7.x (the bundled Python code targets Python 2)
14
+ * Ruby 1.9.2 or newer
15
+ * pip
16
+ * libxml2
17
+
18
+ ## Installation
19
+
20
+ Using Bundler:
21
+
22
+ gem 'opener-pos-tagger-base',
23
+ :git => 'git@github.com:opener-project/pos-tagger-base.git',
24
+ :branch => 'master'
25
+
26
+ Using `specific_install`:
27
+
28
+ gem install specific_install
29
+ gem specific_install opener-pos-tagger-base \
30
+ -l https://github.com/opener-project/pos-tagger-base.git
31
+
32
+ Using regular RubyGems (once the Gem is available):
33
+
34
+ gem install opener-pos-tagger-base
35
+
36
+ ## Usage
37
+
38
+ Tagging a KAF file:
39
+
40
+ cat some_input_file.kaf | pos-tagger-base
41
+
42
+ ## Contributing
43
+
44
+ First make sure all the required dependencies are installed:
45
+
46
+ bundle install
47
+
48
+ Then download the required Python code:
49
+
50
+ bundle exec rake compile
51
+
52
+ Once this is done continue reading the sections below to get a better
53
+ understanding about the repository structure.
54
+
55
+ ## Structure
56
+
57
+ This repository comes in two parts: a collection of Python source files and
58
+ Ruby source code. The Python code can be found in `core/`, the Ruby code can be
59
+ found in the other directories (e.g. `lib/`).
60
+
61
+ Required Python packages are installed locally into `core/site-packages/X`
62
+ where X is one of the following two:
63
+
64
+ * `pre_build`: contains packages that are installed before building the Gem,
65
+ these packages are shipped with the Gem
66
+ * `pre_install`: contains packages that are installed into this directory upon
67
+ installing the Gem. This directory should exclusively be used for compiled
68
+ Python packages such as lxml.
69
+
70
+ There are also two requirements files for pip:
71
+
72
+ * `pre_build_requirements.txt`: installs the requirements for the `pre_build`
73
+ directory.
74
+ * `pre_install_requirements.txt`: installs the requirements for the
75
+ `pre_install` directory.
76
+
77
+ To easily install all the required dependencies (required for running the tests
78
+ for example) run the following:
79
+
80
+ bundle exec rake compile
81
+
82
+ This will take care of verifying the requirements and downloading and
83
+ installing the Python packages.
84
+
85
+ ## Testing
86
+
87
+ To run the tests (which are powered by Cucumber), simply run the following:
88
+
89
+ bundle exec rake
90
+
91
+ This will take care of verifying the requirements, installing the Python code
92
+ and running the tests.
93
+
94
+ For more information on the available Rake tasks run the following:
95
+
96
+ bundle exec rake -T
97
+
98
+ ## POS Details
99
+
100
+ ### POS-tags models
101
+
102
+ * [Dutch-maxent](http://opennlp.sourceforge.net/models-1.5/nl-pos-maxent.bin)
103
+ * [Dutch-perceptron](http://opennlp.sourceforge.net/models-1.5/nl-pos-perceptron.bin)
104
+ * [German-maxent](http://opennlp.sourceforge.net/models-1.5/de-pos-maxent.bin)
105
+ * [German-perceptron](http://opennlp.sourceforge.net/models-1.5/de-pos-perceptron.bin)
106
+
107
+ ### POS-tags sets
108
+
109
+ * Dutch: trained on conllx alpino data, wotan tagset
110
+ * German: trained on TIGER corpus, STTS tagset
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/base'
4
+
5
# Only read STDIN when something is piped in: STDIN.tty? is `false` in that
# case. When run from an interactive terminal no input is passed along.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::Base.new(:args => ARGV)
stdout, stderr, process = tagger.run(input)

# On failure print the captured error output and exit with a non-zero status.
abort stderr unless process.success?

puts stdout

# A successful run may still have produced diagnostics; forward them.
STDERR.puts(stderr) unless stderr.empty?
@@ -0,0 +1,52 @@
1
+ ADJA G ("Attributives Adjektiv"),
2
+ ADJD G ("Adverbiales oder prädikatives Adjektiv"),
3
+ ADV A ("Adverb"),
4
+ APPR P ("Präposition; Zirkumposition links"),
5
+ APPRART P ("Präposition mit Artikel"),
6
+ APPO P ("Postposition"),
7
+ APZR P ("Zirkumposition rechts"),
8
+ ART D ("Bestimmter oder unbestimmter Artikel"),
9
+ CARD O ("Kardinalzahl"),
10
+ FM O ("Fremdsprachliches Material"),
11
+ ITJ O ("Interjektion"),
12
+ KOUI C ("unterordnende Konjunktion mit zu und Infinitiv"),
13
+ KOUS C ("unterordnende Konjunktion mit Satz"),
14
+ KON C ("nebenordnende Konjunktion"),
15
+ KOKOM C ("Vergleichskonjunktion"),
16
+ NN N ("normales Nomen"),
17
+ NE R ("Eigennamen"),
18
+ PDS Q ("substituierendes Demonstrativpronomen"),
19
+ PDAT Q ("attribuierendes Demonstrativpronomen"),
20
+ PIS Q ("substituierendes Indefinitpronomen"),
21
+ PIAT Q ("attribuierendes Indefinitpronomen ohne Determiner"),
22
+ PIDAT Q ("attribuierendes Indefinitpronomen mit Determiner"),
23
+ PPER Q ("irreflexives Personalpronomen"),
24
+ PPOSS Q ("substituierendes Possessivpronomen"),
25
+ PPOSAT Q ("attribuierendes Possessivpronomen"),
26
+ PRELS Q ("substituierendes Relativpronomen"),
27
+ PRELAT Q ("attribuierendes Relativpronomen"),
28
+ PRF Q ("reflexives Personalpronomen"),
29
+ PWS Q ("substituierendes Interrogativpronomen"),
30
+ PWAT Q ("attribuierendes Interrogativpronomen"),
31
+ PWAV Q ("adverbiales Interrogativ- oder Relativpronomen"),
32
+ PAV Q ("Pronominaladverb"),
33
+ PTKZU O ("zu vor Infinitiv"),
34
+ PTKNEG O ("Negationspartikel"),
35
+ PTKVZ V ("abgetrennter Verbzusatz"),
36
+ PTKANT O ("Antwortpartikel"),
37
+ PTKA O ("Partikel bei Adjektiv oder Adverb"),
38
+ TRUNC N ("Kompositions-Erstglied"),
39
+ VVFIN V ("finites Verb, voll"),
40
+ VVIMP V ("Imperativ, voll"),
41
+ VVINF V ("Infinitiv"),
42
+ VVIZU V ("Infinitiv mit zu"),
43
+ VVPP V ("Partizip Perfekt"),
44
+ VAFIN V ("finites Verb, aux"),
45
+ VAIMP V ("Imperativ, aux"),
46
+ VAINF V ("Infinitiv, aux"),
47
+ VAPP V ("Partizip Perfekt"),
48
+ VMFIN V ("finites Verb, modal"),
49
+ VMINF V ("Infinitiv, modal"),
50
+ VMPP V ("Partizip Perfekt, modal"),
51
+ XY O ("Nichtwort, Sonderzeichen"),
52
+ UNDEFINED O ("Nicht definiert, zb. Satzzeichen");
@@ -0,0 +1,13 @@
1
+ Adj G Adjective
2
+ Adv A Adverb
3
+ Art D Article determiner
4
+ Conj C Conjunction
5
+ Int O Interjection
6
+ N N Noun
7
+ Num O Numeral
8
+ Misc O Miscellaneous
9
+ Prep P Preposition
10
+ Pron Q Pronoun
11
+ Punc O Punctuation
12
+ V V Verb
13
+
@@ -0,0 +1,35 @@
1
+ #!/bin/sh
2
+
3
+ # Licensed to the Apache Software Foundation (ASF) under one
4
+ # or more contributor license agreements. See the NOTICE file
5
+ # distributed with this work for additional information
6
+ # regarding copyright ownership. The ASF licenses this file
7
+ # to you under the Apache License, Version 2.0 (the
8
+ # "License"); you may not use this file except in compliance
9
+ # with the License. You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing,
14
+ # software distributed under the License is distributed on an
15
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
+ # KIND, either express or implied. See the License for the
17
+ # specific language governing permissions and limitations
18
+ # under the License.
19
+
20
+ # Note: Do not output anything in this script file, any output
21
+ # may be inadvertently placed in any output files if
22
+ # output redirection is used.
23
+
24
# Resolve the Java launcher: an explicitly set $JAVACMD wins, then
# $JAVA_HOME/bin/java, then whatever `java` is found on PATH.
if [ -z "$JAVACMD" ] ; then
  if [ -n "$JAVA_HOME" ] ; then
    JAVACMD="$JAVA_HOME/bin/java"
  else
    JAVACMD="`which java`"
  fi
fi

# Might fail if $0 is a link
OPENNLP_HOME="$(dirname "$0")/.."

# Every expansion is quoted so that an install path or an argument containing
# whitespace is not split into several words. The jar glob is intentionally
# left outside the quotes so the shell still expands it to the versioned jar.
"$JAVACMD" -Xmx1024m -jar "$OPENNLP_HOME"/lib/opennlp-tools-*.jar "$@"
@@ -0,0 +1,35 @@
1
+ @ECHO off
2
+
3
+ REM # Licensed to the Apache Software Foundation (ASF) under one
4
+ REM # or more contributor license agreements. See the NOTICE file
5
+ REM # distributed with this work for additional information
6
+ REM # regarding copyright ownership. The ASF licenses this file
7
+ REM # to you under the Apache License, Version 2.0 (the
8
+ REM # "License"); you may not use this file except in compliance
9
+ REM # with the License. You may obtain a copy of the License at
10
+ REM #
11
+ REM # http://www.apache.org/licenses/LICENSE-2.0
12
+ REM #
13
+ REM # Unless required by applicable law or agreed to in writing,
14
+ REM # software distributed under the License is distributed on an
15
+ REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
+ REM # KIND, either express or implied. See the License for the
17
+ REM # specific language governing permissions and limitations
18
+ REM # under the License.
19
+
20
+ REM # Note: Do not output anything in this script file, any output
21
+ REM # may be inadvertently placed in any output files if
22
+ REM # output redirection is used.
23
+
24
REM # Resolve the Java launcher: an explicitly set JAVA_CMD wins, then
REM # %JAVA_HOME%\bin\java, then whatever "java" is found on PATH.
REM # The SET "name=value" form keeps spaces and parentheses in JAVA_HOME
REM # (e.g. "C:\Program Files (x86)\Java\...") from terminating the IF block.
IF "%JAVA_CMD%" == "" (
  IF "%JAVA_HOME%" == "" (
    SET "JAVA_CMD=java"
  ) ELSE (
    SET "JAVA_CMD=%JAVA_HOME%\bin\java"
  )
)

REM # Should work with Windows XP and greater. If not, specify the path to where it is installed.
IF "%OPENNLP_HOME%" == "" SET "OPENNLP_HOME=%~sp0.."

REM # JAVA_CMD is quoted so a launcher path containing spaces is run as one command.
"%JAVA_CMD%" -Xmx4096m -jar %OPENNLP_HOME%\lib\opennlp-tools-*.jar %*
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env python
2
+ #-*- coding: utf-8 *-*
3
+ # Ruben Izquierdo
4
+ # Vrije University of Amsterdam
5
+
6
+ import os
7
+ import sys
8
+ import operator
9
+ import time
10
+ import getopt
11
+ import string
12
+ import subprocess
13
+
14
+ os.environ["LC_CTYPE"] = 'en_US.UTF-8'
15
+
16
+ this_folder = os.path.dirname(os.path.realpath(__file__))
17
+ opennlp_folder = os.path.join(this_folder, 'opennlp')
18
+ model_folder = os.path.join(opennlp_folder, 'models')
19
+
20
+ # This updates the load path to ensure that the local site-packages directory
21
+ # can be used to load packages (e.g. a locally installed copy of lxml).
22
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
23
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
24
+
25
+ # Config for Dutch
26
+ pos_model_nl = 'nl-pos-maxent.bin'
27
+ mapping_pos_filename_nl = 'mapping.postag.wotan.to.opener.csv'
28
+
29
+ # Config for German
30
+ pos_model_de = 'de-pos-maxent.bin'
31
+ mapping_pos_filename_de = 'mapping.postag.stss.to.opener.csv'
32
+
33
+ mapping_postag_to_kaf = None
34
+ mapping_pos_filename = ""
35
+ __version__ = '2-May-2013'
36
+
37
+ from lxml.etree import ElementTree as ET, Element as EL, PI
38
+ from VUKafParserPy.KafParserMod import KafParser
39
+ from token_matcher import token_matcher
40
+
41
def map_pos_tag(pos):
    """Translate a tagger-specific POS tag into its OpeNER/KAF POS category.

    The mapping table is read lazily on the first call from the file named by
    the module-level ``mapping_pos_filename`` (resolved against
    ``this_folder``) and cached in the module-level ``mapping_postag_to_kaf``.
    Each usable line holds three tab-separated fields: the source tag, the KAF
    category and a description; lines with any other field count are skipped.

    The cache is only published once the file has been read completely, so a
    failed load raises on every call instead of leaving an empty table behind
    that would silently map every tag to 'O'.

    :param pos: tag as produced by the tagger model.
    :return: the mapped KAF category, or 'O' when the tag is not in the table.
    :raises IOError: if the mapping file cannot be opened or read.
    """
    global mapping_postag_to_kaf
    if mapping_postag_to_kaf is None:
        file_mapping = os.path.join(this_folder, mapping_pos_filename)
        table = {}
        # The context manager closes the file even if reading fails midway.
        with open(file_mapping, 'r') as fic:
            for line in fic:
                fields = line.strip().split('\t')
                if len(fields) == 3:
                    table[fields[0]] = fields[1]
        mapping_postag_to_kaf = table
    return mapping_postag_to_kaf.get(pos, 'O')
56
+
57
+
58
if __name__ == '__main__':

    # The tagger works as a stdin -> stdout filter: refuse to wait on an
    # interactive terminal.
    if sys.stdin.isatty():
        sys.stderr.write('Input stream required.\n')
        sys.stderr.write('Example usage: cat myUTF8file.kaf | %s\n' % sys.argv[0])
        sys.exit(-1)

    # "--no-time" switches off the flag handed to addLinguisticProcessor below
    # (presumably whether a timestamp is written -- confirm in KafParser).
    # "-l <value>" is accepted but its value is not used. Unknown options are
    # deliberately ignored so the tagger still runs.
    time_stamp = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], "l:", ["no-time"])
        for opt, arg in opts:
            if opt == "--no-time":
                time_stamp = False
    except getopt.GetoptError:
        pass

    input_kaf = KafParser(sys.stdin)
    my_lang = input_kaf.getLanguage()

    # Pick the model and the tag mapping for the language of the document.
    # mapping_pos_filename is the module-level name read by map_pos_tag().
    if my_lang == 'nl':
        pos_model = pos_model_nl
        mapping_pos_filename = mapping_pos_filename_nl
    elif my_lang == 'de':
        pos_model = pos_model_de
        mapping_pos_filename = mapping_pos_filename_de
    else:
        # Report on stderr: the Ruby wrapper only shows stderr when the
        # process fails. %s also copes with a missing (None) language.
        sys.stderr.write('The language of the input KAF is "%s" and only can be '
                         'Dutch (nl) or German (de)\n' % (my_lang,))
        sys.exit(-1)

    # Group the (word, word id) pairs by sentence id, keeping document order.
    sentences = []
    prev_sent = '-200'  # sentinel, assumed never to be a real sentence id
    aux = []
    for word, sent_id, w_id in input_kaf.getTokens():
        if sent_id != prev_sent:
            if aux:
                sentences.append(aux)
            aux = []
        aux.append((word, w_id))
        prev_sent = sent_id
    if aux:
        sentences.append(aux)

    # The command line is the same for every sentence.
    cmd = [os.path.join(opennlp_folder, 'bin/opennlp'), 'POSTagger',
           os.path.join(model_folder, pos_model)]

    # Running counter so that term ids stay unique across sentences.
    next_term = 0

    for sentence in sentences:
        text = ' '.join(t for t, _ in sentence).encode('utf-8')
        try:
            proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            # communicate() feeds stdin, drains stdout and stderr together
            # (no deadlock on a full stderr pipe) and waits for the process.
            tagged_bytes, tagger_errors = proc.communicate(text)
            text_with_pos = tagged_bytes.strip().decode('utf-8')  ## variable is unicode
        except Exception as e:
            sys.stderr.write(str(e) + '\n')
            sys.exit(-1)

        if proc.returncode != 0:
            # Forward what the tagger reported instead of building terms from
            # incomplete output.
            sys.stderr.write(tagger_errors.decode('ascii', 'ignore'))
            sys.stderr.write('OpenNLP POSTagger exited with status %d\n' % proc.returncode)
            sys.exit(-1)

        # The tagger output is split on single spaces into "<form>_<tag>"
        # items; split each one on its last underscore.
        data = {}
        new_tokens = []
        for n, token in enumerate(text_with_pos.split(' ')):
            position = token.rfind('_')
            if position == -1:
                # No tag attached: keep the whole token as the form.
                lemma = token
                pos = ''
            else:
                lemma = token[:position]
                pos = token[position + 1:]
            my_id = 't_' + str(next_term + n)
            data[my_id] = (lemma, pos)
            new_tokens.append((lemma, my_id))
        next_term += len(new_tokens)

        # token_matcher fills mapping_tokens; it is read below as
        # term id -> ids of the original word forms the term spans.
        mapping_tokens = {}
        token_matcher(sentence, new_tokens, mapping_tokens)
        for token_new, id_new in new_tokens:
            lemma, pos = data[id_new]
            opener_pos = map_pos_tag(pos)
            if opener_pos in ['N', 'R', 'G', 'V', 'A', 'O']:
                type_term = 'open'
            else:
                type_term = 'close'
            ele_term = EL('term', attrib={'tid': id_new,
                                          'type': type_term,
                                          'pos': opener_pos,
                                          'morphofeat': pos,
                                          'lemma': lemma})
            ref_tokens = mapping_tokens[id_new]
            ele_span = EL('span')
            for ref_token in ref_tokens:
                eleTarget = EL('target', attrib={'id': ref_token})
                ele_span.append(eleTarget)
            ele_term.append(ele_span)

            input_kaf.addElementToLayer('terms', ele_term)

    input_kaf.addLinguisticProcessor('Open nlp pos tagger', '1.0', 'term', time_stamp)
    input_kaf.saveToFile(sys.stdout)
    sys.exit(0)
158
+
159
+
160
+