opener-pos-tagger-base 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +7 -0
  2. data/README.md +110 -0
  3. data/bin/pos-tagger-base +21 -0
  4. data/core/mapping.postag.stss.to.opener.csv +52 -0
  5. data/core/mapping.postag.wotan.to.opener.csv +13 -0
  6. data/core/opennlp/bin/opennlp +35 -0
  7. data/core/opennlp/bin/opennlp.bat +35 -0
  8. data/core/opennlp/lib/jwnl-1.3.3.jar +0 -0
  9. data/core/opennlp/lib/opennlp-maxent-3.0.2-incubating.jar +0 -0
  10. data/core/opennlp/lib/opennlp-tools-1.5.2-incubating.jar +0 -0
  11. data/core/opennlp/lib/opennlp-uima-1.5.2-incubating.jar +0 -0
  12. data/core/opennlp/models/de-pos-maxent.bin +0 -0
  13. data/core/opennlp/models/de-pos-perceptron.bin +0 -0
  14. data/core/opennlp/models/nl-pos-maxent.bin +0 -0
  15. data/core/opennlp/models/nl-pos-perceptron.bin +0 -0
  16. data/core/pos-tagger_open-nlp.py +160 -0
  17. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  18. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  20. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  21. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  23. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  24. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  25. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  26. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  27. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  28. data/core/token_matcher.py +80 -0
  29. data/ext/hack/support.rb +38 -0
  30. data/lib/opener/pos_taggers/base.rb +90 -0
  31. data/lib/opener/pos_taggers/base/version.rb +7 -0
  32. data/opener-pos-tagger-base.gemspec +29 -0
  33. data/pre_build_requirements.txt +1 -0
  34. metadata +132 -0
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e1d01b280c3f2369e20c811fa11a42150b41cc16
+   data.tar.gz: 7639fb3ce4fb64641047659339b500157940087c
+ SHA512:
+   metadata.gz: 31dd9808cc4b3ce95de10c8e95c456af963a2b93cc1bbef60e9917716b9de830ce3b749c28b63017ab7ce90393172cf499563400fbb62873a39eb0be2d0e2f1a
+   data.tar.gz: 7b9ab3549277fc1c93b09b60eae00f9b56894dee3565550f9eee7d82209316d770083df32085885282da18a3c12475b1e4c897f9c8fd928f2c144841bea88e3d
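These checksums pair each archive inside the gem (metadata.gz and data.tar.gz) with a SHA1 and a SHA512 digest. A minimal sketch of how such digests can be recomputed for comparison, assuming the two archives have already been extracted from the downloaded .gem file (the extraction step is not shown):

```python
import hashlib

def file_digests(path):
    """Return (sha1, sha512) hex digests of the file at `path`."""
    sha1, sha512 = hashlib.sha1(), hashlib.sha512()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(8192), b''):
            sha1.update(chunk)
            sha512.update(chunk)
    return sha1.hexdigest(), sha512.hexdigest()

# Compare the printed values against the entries in checksums.yaml.
for name in ('metadata.gz', 'data.tar.gz'):
    print(name, *file_digests(name))
```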
@@ -0,0 +1,110 @@
+ [![Build Status](https://drone.io/github.com/opener-project/pos-tagger-base/status.png)](https://drone.io/github.com/opener-project/pos-tagger-base/latest)
+
+ # Base POS Tagger
+
+ This repository contains the source code (both Ruby and Python) for the base
+ POS tagger. Currently this tagger supports the following languages:
+
+ * Dutch
+ * German
+
+ ## Requirements
+
+ * Python 2.7.0 or newer
+ * Ruby 1.9.2 or newer
+ * pip
+ * libxml2
+
+ ## Installation
+
+ Using Bundler:
+
+     gem 'opener-pos-tagger-base',
+       :git    => 'git@github.com:opener-project/pos-tagger-base.git',
+       :branch => 'master'
+
+ Using `specific_install`:
+
+     gem install specific_install
+     gem specific_install opener-pos-tagger-base \
+       -l https://github.com/opener-project/pos-tagger-base.git
+
+ Using regular RubyGems (once the Gem is available):
+
+     gem install opener-pos-tagger-base
+
+ ## Usage
+
+ Tagging a KAF file:
+
+     cat some_input_file.kaf | pos-tagger-base
+
+ ## Contributing
+
+ First make sure all the required dependencies are installed:
+
+     bundle install
+
+ Then download the required Python code:
+
+     bundle exec rake compile
+
+ Once this is done, continue reading the sections below to get a better
+ understanding of the repository structure.
+
+ ## Structure
+
+ This repository comes in two parts: a collection of Python source files and
+ Ruby source code. The Python code can be found in `core/`, while the Ruby code
+ can be found in the other directories (e.g. `lib/`).
+
+ Required Python packages are installed locally into `core/site-packages/X`,
+ where X is one of the following two:
+
+ * `pre_build`: contains packages that are installed before building the Gem;
+   these packages are shipped with the Gem
+ * `pre_install`: contains packages that are installed into this directory upon
+   installing the Gem. This directory should exclusively be used for compiled
+   Python packages such as lxml.
+
+ There are also two requirements files for pip:
+
+ * `pre_build_requirements.txt`: installs the requirements for the `pre_build`
+   directory.
+ * `pre_install_requirements.txt`: installs the requirements for the
+   `pre_install` directory.
+
+ To easily install all the required dependencies (needed, for example, to run
+ the tests), run the following:
+
+     bundle exec rake compile
+
+ This will take care of verifying the requirements and downloading and
+ installing the Python packages.
+
+ ## Testing
+
+ To run the tests (which are powered by Cucumber), simply run the following:
+
+     bundle exec rake
+
+ This will take care of verifying the requirements, installing the Python code
+ and running the tests.
+
+ For more information on the available Rake tasks, run the following:
+
+     bundle exec rake -T
+
+ ## POS Details
+
+ ### POS tagging models
+
+ * [Dutch-maxent](http://opennlp.sourceforge.net/models-1.5/nl-pos-maxent.bin)
+ * [Dutch-perceptron](http://opennlp.sourceforge.net/models-1.5/nl-pos-perceptron.bin)
+ * [German-maxent](http://opennlp.sourceforge.net/models-1.5/de-pos-maxent.bin)
+ * [German-perceptron](http://opennlp.sourceforge.net/models-1.5/de-pos-perceptron.bin)
+
+ ### POS tag sets
+
+ * Dutch: trained on CoNLL-X Alpino data, Wotan tagset
+ * German: trained on the TIGER corpus, STTS tagset
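As the Usage section of the README above notes, the `pos-tagger-base` executable reads a KAF document on standard input and writes the tagged document to standard output. A minimal sketch of driving it from another program, assuming the gem's executable is on `PATH` (the wrapper function is hypothetical, and the sketch itself uses Python 3 even though the gem's bundled code targets Python 2.7):

```python
import subprocess

def tag_kaf(kaf_text):
    """Pipe a KAF document through pos-tagger-base and return the tagged KAF."""
    result = subprocess.run(
        ['pos-tagger-base'],
        input=kaf_text.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        raise RuntimeError(result.stderr.decode('utf-8'))
    return result.stdout.decode('utf-8')

with open('some_input_file.kaf') as handle:
    print(tag_kaf(handle.read()))
```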
@@ -0,0 +1,21 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/opener/pos_taggers/base'
+
+ # STDIN.tty? returns `false` if data is being piped into the current process.
+ if STDIN.tty?
+   input = nil
+ else
+   input = STDIN.read
+ end
+
+ kernel = Opener::POSTaggers::Base.new(:args => ARGV)
+ stdout, stderr, process = kernel.run(input)
+
+ if process.success?
+   puts stdout
+
+   STDERR.puts(stderr) unless stderr.empty?
+ else
+   abort stderr
+ end
@@ -0,0 +1,52 @@
+ ADJA G ("Attributives Adjektiv"),
+ ADJD G ("Adverbiales oder prädikatives Adjektiv"),
+ ADV A ("Adverb"),
+ APPR P ("Präposition; Zirkumposition links"),
+ APPRART P ("Präposition mit Artikel"),
+ APPO P ("Postposition"),
+ APZR P ("Zirkumposition rechts"),
+ ART D ("Bestimmer oder unbestimmer Artikel"),
+ CARD O ("Kardinalzahl"),
+ FM O ("Fremdsprachichles Material"),
+ ITJ O ("Interjektion"),
+ KOUI C ("unterordnende Konjunktion mit zu und Infinitiv"),
+ KOUS C ("unterordnende Konjunktion mit Satz"),
+ KON C ("nebenordnende Konjunktion"),
+ KOKOM C ("Vergleichskonjunktion"),
+ NN N ("normales Nomen"),
+ NE R ("Eigennamen"),
+ PDS Q ("substituierendes Demonstrativpronomen"),
+ PDAT Q ("attribuierendes Demonstrativpronomen"),
+ PIS Q ("substituierendes Indefinitpronomen"),
+ PIAT Q ("attribuierendes Indefinitpronomen ohne Determiner"),
+ PIDAT Q ("attribuierendes Indefinitpronomen mit Determiner"),
+ PPER Q ("irreflexives Personalpronomen"),
+ PPOSS Q ("substituierendes Possessivpronomen"),
+ PPOSAT Q ("attribuierendes Possessivpronomen"),
+ PRELS Q ("substituierendes Relativpronomen"),
+ PRELAT Q ("attribuierendes Relativpronomen"),
+ PRF Q ("reflexives Personalpronomen"),
+ PWS Q ("substituierendes Interrogativpronomen"),
+ PWAT Q ("attribuierendes Interrogativpronomen"),
+ PWAV Q ("adverbiales Interrogativ- oder Relativpronomen"),
+ PAV Q ("Pronominaladverb"),
+ PTKZU O ("zu vor Infinitiv"),
+ PTKNEG O ("Negationspartike"),
+ PTKVZ V ("abgetrennter Verbzusatz"),
+ PTKANT O ("Antwortpartikel"),
+ PTKA O ("Partikel bei Adjektiv oder Adverb"),
+ TRUNC N ("Kompositions-Erstglied"),
+ VVFIN V ("finites Verb, voll"),
+ VVIMP V ("Imperativ, voll"),
+ VVINF V ("Infinitiv"),
+ VVIZU V ("Infinitiv mit zu"),
+ VVPP V ("Partizip Perfekt"),
+ VAFIN V ("finites Verb, aux"),
+ VAIMP V ("Imperativ, aux"),
+ VAINF V ("Infinitiv, aux"),
+ VAPP V ("Partizip Perfekt"),
+ VMFIN V ("finites Verb, modal"),
+ VMINF V ("Infinitiv, modal"),
+ VMPP V ("Partizip Perfekt, modal"),
+ XY O ("Nichtwort, Sonderzeichen"),
+ UNDEFINED O ("Nicht definiert, zb. Satzzeichen");
@@ -0,0 +1,13 @@
+ Adj G Adjective
+ Adv A Adverb
+ Art D Article determiner
+ Conj C Conjunction
+ Int O Interjection
+ N N Noun
+ Num O Numeral
+ Misc O Miscelaneous
+ Prep P Preposition
+ Pron Q Pronoun
+ Punc O Punctuation
+ V V Verb
+
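Both mapping files are three-column, tab-separated tables: the tagger's native tag (Wotan for Dutch, the German tagset above), the single-letter OpenER/KAF part-of-speech code, and a description. The core script loads the relevant file into a dictionary and falls back to 'O' for unknown tags; a minimal standalone sketch of that lookup, mirroring `map_pos_tag` in `core/pos-tagger_open-nlp.py`:

```python
def load_pos_mapping(path):
    """Parse a three-column, tab-separated mapping file into {native_tag: kaf_tag}."""
    mapping = {}
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if len(fields) == 3:
                native_tag, kaf_tag, _description = fields
                mapping[native_tag] = kaf_tag
    return mapping

wotan_to_kaf = load_pos_mapping('mapping.postag.wotan.to.opener.csv')
print(wotan_to_kaf.get('Adj', 'O'))  # => 'G'
print(wotan_to_kaf.get('XYZ', 'O'))  # unknown tags default to 'O'
```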
@@ -0,0 +1,35 @@
+ #!/bin/sh
+
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+
+ # Note: Do not output anything in this script file, any output
+ # may be inadvertantly placed in any output files if
+ # output redirection is used.
+
+ if [ -z "$JAVACMD" ] ; then
+   if [ -n "$JAVA_HOME" ] ; then
+     JAVACMD="$JAVA_HOME/bin/java"
+   else
+     JAVACMD="`which java`"
+   fi
+ fi
+
+ # Might fail if $0 is a link
+ OPENNLP_HOME=`dirname "$0"`/..
+
+ $JAVACMD -Xmx1024m -jar $OPENNLP_HOME/lib/opennlp-tools-*.jar $@
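This launcher only locates a Java runtime and forwards its arguments to the bundled `opennlp-tools` jar with `-Xmx1024m`. The core tagger drives it through a pipe, passing the `POSTagger` tool name and a model file and feeding one whitespace-tokenised sentence on stdin; a minimal sketch of that call, mirroring the subprocess invocation in `core/pos-tagger_open-nlp.py` (paths assumed relative to `core/`):

```python
import os
import subprocess

OPENNLP_BIN = os.path.join('opennlp', 'bin', 'opennlp')
MODEL = os.path.join('opennlp', 'models', 'nl-pos-maxent.bin')

def tag_sentence(sentence):
    """Run OpenNLP's POSTagger over one whitespace-tokenised sentence."""
    proc = subprocess.Popen(
        [OPENNLP_BIN, 'POSTagger', MODEL],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, _err = proc.communicate(sentence.encode('utf-8'))
    # The tool emits "token_TAG token_TAG ..." for each input line.
    return out.strip().decode('utf-8')

print(tag_sentence('Dit is een test .'))
```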
@@ -0,0 +1,35 @@
+ @ECHO off
+
+ REM # Licensed to the Apache Software Foundation (ASF) under one
+ REM # or more contributor license agreements. See the NOTICE file
+ REM # distributed with this work for additional information
+ REM # regarding copyright ownership. The ASF licenses this file
+ REM # to you under the Apache License, Version 2.0 (the
+ REM # "License"); you may not use this file except in compliance
+ REM # with the License. You may obtain a copy of the License at
+ REM #
+ REM # http://www.apache.org/licenses/LICENSE-2.0
+ REM #
+ REM # Unless required by applicable law or agreed to in writing,
+ REM # software distributed under the License is distributed on an
+ REM # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ REM # KIND, either express or implied. See the License for the
+ REM # specific language governing permissions and limitations
+ REM # under the License.
+
+ REM # Note: Do not output anything in this script file, any output
+ REM # may be inadvertantly placed in any output files if
+ REM # output redirection is used.
+
+ IF "%JAVA_CMD%" == "" (
+   IF "%JAVA_HOME%" == "" (
+     SET JAVA_CMD=java
+   ) ELSE (
+     SET JAVA_CMD=%JAVA_HOME%\bin\java
+   )
+ )
+
+ REM # Should work with Windows XP and greater. If not, specify the path to where it is installed.
+ IF "%OPENNLP_HOME%" == "" SET OPENNLP_HOME=%~sp0..
+
+ %JAVA_CMD% -Xmx4096m -jar %OPENNLP_HOME%\lib\opennlp-tools-*.jar %*
@@ -0,0 +1,160 @@
+ #!/usr/bin/env python
+ #-*- coding: utf-8 *-*
+ # Ruben Izquierdo
+ # Vrije University of Amsterdam
+
+ import os
+ import sys
+ import operator
+ import time
+ import getopt
+ import string
+ import subprocess
+
+ os.environ["LC_CTYPE"] = 'en_US.UTF-8'
+
+ this_folder = os.path.dirname(os.path.realpath(__file__))
+ opennlp_folder = os.path.join(this_folder, 'opennlp')
+ model_folder = os.path.join(opennlp_folder, 'models')
+
+ # This updates the load path to ensure that the local site-packages directory
+ # can be used to load packages (e.g. a locally installed copy of lxml).
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
+
+ # Config for Dutch
+ pos_model_nl = 'nl-pos-maxent.bin'
+ mapping_pos_filename_nl = 'mapping.postag.wotan.to.opener.csv'
+
+ # Config for German
+ pos_model_de = 'de-pos-maxent.bin'
+ mapping_pos_filename_de = 'mapping.postag.stss.to.opener.csv'
+
+ mapping_postag_to_kaf = None
+ mapping_pos_filename = ""
+ __version__ = '2-May-2013'
+
+ from lxml.etree import ElementTree as ET, Element as EL, PI
+ from VUKafParserPy.KafParserMod import KafParser
+ from token_matcher import token_matcher
+
+ def map_pos_tag(pos):
+     global mapping_postag_to_kaf
+     if mapping_postag_to_kaf is None:
+         mapping_postag_to_kaf = {}
+         file_mapping = os.path.join(this_folder,mapping_pos_filename)
+         fic = open(file_mapping,'r')
+         for line in fic:
+             fields = line.strip().split('\t')
+             if len(fields)==3:
+                 wotan_pos = fields[0]
+                 kaf_pos = fields[1]
+                 mapping_postag_to_kaf[wotan_pos] = kaf_pos
+         fic.close()
+     opener_pos = mapping_postag_to_kaf.get(pos,'O')
+     return opener_pos
+
+
+ if __name__=='__main__':
+
+     if sys.stdin.isatty():
+         print>>sys.stderr,'Input stream required.'
+         print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
+         sys.exit(-1)
+
+     time_stamp = True
+     try:
+         opts, args = getopt.getopt(sys.argv[1:],"l:",["no-time"])
+         for opt, arg in opts:
+             if opt == "--no-time":
+                 time_stamp = False
+     except getopt.GetoptError:
+         pass
+
+
+     input_kaf = KafParser(sys.stdin)
+     my_lang = input_kaf.getLanguage()
+
+     if my_lang == 'nl':
+         pos_model= pos_model_nl
+         mapping_pos_filename= mapping_pos_filename_nl
+     elif my_lang =='de':
+         pos_model = pos_model_de
+         mapping_pos_filename = mapping_pos_filename_de
+     else:
+         print>>sys.stdout,'The language of the input KAF is "'+my_lang+'" and only can be Dutch (nl) or German (de)'
+         sys.exit(-1)
+
+
+
+
+
+     ## Create the input text for
+     reference_tokens = []
+     sentences = []
+     prev_sent='-200'
+     aux = []
+     for word, sent_id, w_id in input_kaf.getTokens():
+         if sent_id != prev_sent:
+             if len(aux) != 0:
+                 sentences.append(aux)
+                 aux = []
+         aux.append((word,w_id))
+
+         prev_sent = sent_id
+     if len(aux)!=0:
+         sentences.append(aux)
+
+     for sentence in sentences:
+         text = ' '.join(t for t,_ in sentence).encode('utf-8')
+         cmd = [os.path.join(opennlp_folder,'bin/opennlp'), 'POSTagger',os.path.join(model_folder,pos_model)]
+         try:
+             proc = subprocess.Popen(cmd,stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+             proc.stdin.write(text)
+             proc.stdin.close()
+             text_with_pos = proc.stdout.read().strip().decode('utf-8') ## variable is unicode
+             proc.terminate()
+
+         except Exception as e:
+             print>>sys.stderr,str(e)
+             sys.exit(-1)
+
+         data = {}
+         new_tokens = []
+         for n, token in enumerate(text_with_pos.split(' ')):
+             position = token.rfind('_')
+             lemma = token[:position]
+             pos = token[position+1:]
+             my_id='t_'+str(n)
+             data[my_id] = (lemma,pos)
+             new_tokens.append((lemma,my_id))
+
+         mapping_tokens = {}
+         token_matcher(sentence,new_tokens,mapping_tokens)
+         for token_new,id_new in new_tokens:
+             lemma,pos = data[id_new]
+             opener_pos = map_pos_tag(pos)
+             if opener_pos in ['N','R','G','V','A','O']:
+                 type_term = 'open'
+             else:
+                 type_term = 'close'
+             ele_term = EL('term',attrib={'tid':id_new,
+                                          'type':type_term,
+                                          'pos':opener_pos,
+                                          'morphofeat':pos,
+                                          'lemma':lemma})
+             ref_tokens = mapping_tokens[id_new]
+             ele_span = EL('span')
+             for ref_token in ref_tokens:
+                 eleTarget = EL('target',attrib={'id':ref_token})
+                 ele_span.append(eleTarget)
+             ele_term.append(ele_span)
+
+             input_kaf.addElementToLayer('terms', ele_term)
+
+     input_kaf.addLinguisticProcessor('Open nlp pos tagger','1.0', 'term', time_stamp)
+     input_kaf.saveToFile(sys.stdout)
+     sys.exit(0)
+
+
+
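For reference, each iteration of the term loop above appends a KAF `<term>` element whose `<span>` points back at the original word-form tokens. A minimal sketch of that structure built with lxml (the attribute values are illustrative, not taken from a real document):

```python
from lxml.etree import Element, SubElement, tostring

term = Element('term', attrib={
    'tid': 't_0',        # id assigned per tagged token
    'type': 'open',      # 'open' for N/R/G/V/A/O, 'close' otherwise
    'pos': 'N',          # mapped OpenER/KAF tag
    'morphofeat': 'NN',  # original tag emitted by OpenNLP
    'lemma': 'test',
})
span = SubElement(term, 'span')
SubElement(span, 'target', attrib={'id': 'w_1'})  # reference to a token in the text layer
print(tostring(term, pretty_print=True).decode())
```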