opener-pos-tagger-en-es 2.0.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89157ca3c263b71b58644e6d5affb554ba67bf68
4
- data.tar.gz: b61b257b2e6af19487d7c477f6ab9c8573260d5c
3
+ metadata.gz: cf8e78b3be6d89b98ddc381d07ad70f5e0a454e4
4
+ data.tar.gz: 4c92415365aacfa409539d9257ecec6749b647a7
5
5
  SHA512:
6
- metadata.gz: a78849a4f64d09070a073b25f433e345539e7fed6528720aae83113db9ad3565953594051135f7e3e4f41a2957f5dd63048c2ae67a20d3e37a78f1bcc4f05dde
7
- data.tar.gz: 2835d0aae655bc7b5d364f97f476acc2786e73a78a48862b705445631c435d30c5e5a48a1e0bbb20f16622c3b59ae10d4b3fbe885b107f90cc9f8e163ab25f8c
6
+ metadata.gz: 13e994b170f0519cce76392d7e37d989c766796b1d4760127e41103974a9625ce71dece012166ef58c3eae0640f713d115150ea6a31831e3508c3911d11ed163
7
+ data.tar.gz: c8a2f87ad1be16242ae300d5b61f8b730ab1bcbce55419c84668e7261fba2a0316066f3c336f5a5ea4cfaf9bc32df0b6c0410b0e1e1ff684e49c52aa13eef996
@@ -0,0 +1,13 @@
1
+ Copyright 2014 OpeNER Project Consortium
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md CHANGED
@@ -2,20 +2,25 @@
2
2
 
3
3
  # English, Spanish, Dutch, Italian, French POS Tagger
4
4
 
5
- **Modified to also do Dutch**
6
-
7
5
  This repository contains the source code for the English & Spanish POS tagger of the
8
6
  OpeNER project.
9
7
 
10
- English perceptron models have been trained and evaluated using the WSJ
11
- treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
12
- Feature-rich part-of-speech tagging with a cyclic dependency network. In
13
- Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.48%
14
- vs 97.24% obtained by Toutanova et al. (2003).
8
+ * English perceptron models have been trained and evaluated using the WSJ
9
+ treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
10
+ Feature-rich part-of-speech tagging with a cyclic dependency network. In
11
+ Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.87%
12
+ vs 97.24% obtained by Toutanova et al. (2003).
13
+
14
+ * Spanish Maximum Entropy models have been trained and evaluated using the Ancora
15
+ corpus; it was randomly divided into 90% for training (450K words) and 10% for testing
16
+ (50K words), obtaining a performance of 98.88%.
17
+
18
+ * French Maximum Entropy models trained with the ESTER corpus.
19
+
20
+ * Italian Perceptron models trained with the TUT Treebank.
15
21
 
16
- Spanish Maximum Entropy models have been trained and evaluated using the Ancora
17
- corpus; it was randomly divided in 90% for training (440K words) and 10% testing
18
- (70K words), obtaining a performance of 98.88%.
22
+ * Dutch Perceptron model publicly available at the Apache OpenNLP website:
23
+ (http://opennlp.sourceforge.net/models-1.5/)
19
24
 
20
25
  ## Requirements
21
26
 
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::EN.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::EN.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -10,4 +10,5 @@ else
10
10
  end
11
11
 
12
12
  kernel = Opener::POSTaggers::ES.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::FR.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::FR.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::IT.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::IT.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::NL.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::NL.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
Binary file
@@ -1,118 +1,3 @@
1
- require 'open3'
2
- require 'stringio'
3
-
4
- require 'java'
5
-
6
- require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
7
-
8
- import 'java.io.InputStreamReader'
9
- import 'ixa.kaflib.KAFDocument'
10
- import 'ehu.pos.Annotate'
11
- import 'ehu.pos.Resources'
12
- import 'ehu.lemmatize.MorfologikLemmatizer'
13
- #import 'ehu.lemmatize.Dictionary'
14
- import 'ehu.lemmatize.LemmatizerDispatcher'
15
-
16
- require_relative 'en/version'
17
-
18
- module Opener
19
- module POSTaggers
20
- ##
21
- # The POS tagger that supports English and Spanish.
22
- #
23
- # @!attribute [r] args
24
- # @return [Array]
25
- # @!attribute [r] options
26
- # @return [Hash]
27
- #
28
- class EN
29
- attr_reader :args, :options
30
-
31
- ##
32
- # The default language to use.
33
- #
34
- # @return [String]
35
- #
36
- DEFAULT_LANGUAGE = 'en'.freeze
37
-
38
- ##
39
- # @param [Hash] options
40
- #
41
- # @option options [Array] :args The commandline arguments to pass to the
42
- # underlying Python script.
43
- #
44
- def initialize(options = {})
45
- @args = options.delete(:args) || []
46
- @options = options
47
- end
48
-
49
- ##
50
- # Runs the command and returns the output of STDOUT, STDERR and the
51
- # process information.
52
- #
53
- # @param [String] input The input to tag.
54
- # @return [Array]
55
- #
56
- def run(input)
57
- input = StringIO.new(input) unless input.kind_of?(IO)
58
- reader = InputStreamReader.new(input.to_inputstream)
59
- kaf = KAFDocument.create_from_stream(reader)
60
- annotator = Java::ehu.pos.Annotate.new(language)
61
-
62
- kaf.addLinguisticProcessor("terms","ehu-pos-"+language,"now","1.0");
63
- annotator.annotatePOSToKAF(kaf, lemmatizer, language)
64
-
65
- return kaf.to_string
66
- end
67
-
68
- protected
69
-
70
- ##
71
- # Returns the lemmatizer to use.
72
- #
73
- def lemmatizer
74
- return LemmatizerDispatcher.obtainMorfologikLemmatizer(language)
75
- end
76
-
77
- ##
78
- # @return [String]
79
- #
80
- def language
81
- return options[:language] || DEFAULT_LANGUAGE
82
- end
83
- end # EN
84
-
85
- class ES < EN
86
- ##
87
- # @return [String]
88
- #
89
- def language
90
- return 'es'
91
- end
92
- end # ES
93
-
94
- class NL < EN
95
- def language
96
- return 'nl'
97
- end
98
- end # NL
99
-
100
- class IT < EN
101
- ##
102
- # @return [String]
103
- #
104
- def language
105
- return 'it'
106
- end
107
- end # IT
108
-
109
- class FR < EN
110
- ##
111
- # @return [String]
112
- #
113
- def language
114
- return 'fr'
115
- end
116
- end # FR
117
- end # POSTaggers
118
- end # Opener
1
+ # This file exists for backwards compatibility; it's recommended to require
2
+ # en_es.rb instead.
3
+ require_relative 'en_es'
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+ require 'stringio'
3
+ require 'nokogiri'
4
+
5
+ require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
6
+
7
+ # import 'java.io.InputStreamReader'
8
+ # import 'ixa.kaflib.KAFDocument'
9
+ # import 'ehu.pos.Resources'
10
+ # import 'ehu.lemmatize.MorfologikLemmatizer'
11
+ # import 'ehu.lemmatize.LemmatizerDispatcher'
12
+
13
+ require_relative 'en_es/version'
14
+ require_relative 'en_es/en_es'
15
+ require_relative 'en_es/en'
16
+ require_relative 'en_es/es'
17
+ require_relative 'en_es/nl'
18
+ require_relative 'en_es/it'
19
+ require_relative 'en_es/fr'
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # English POS tagger class. This class forces the language to English
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class EN < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'en'
13
+ end
14
+ end # EN
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,95 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Base POS tagger class for the various language specific ones such as
5
+ # {Opener::POSTaggers::FR}.
6
+ #
7
+ # @!attribute [r] args
8
+ # @return [Array]
9
+ #
10
+ # @!attribute [r] options
11
+ # @return [Hash]
12
+ #
13
+ class EnEs
14
+ attr_reader :args, :options
15
+
16
+ ##
17
+ # The default options to use.
18
+ #
19
+ # @return [Hash]
20
+ #
21
+ DEFAULT_OPTIONS = {
22
+ :enable_time => true
23
+ }
24
+
25
+ ##
26
+ # @param [Hash] options
27
+ #
28
+ # @option options [Array] :args
29
+ #
30
+ # @option options [TrueClass|FalseClass] :enable_time When set to `true`
31
+ # (default) dynamic timestamps will be added.
32
+ #
33
+ def initialize(options = {})
34
+ @args = options.delete(:args) || []
35
+ @options = DEFAULT_OPTIONS.merge(options)
36
+ end
37
+
38
+ ##
39
+ # Runs the command and returns the resulting KAF document.
40
+ #
41
+ # @param [String] input The input to tag.
42
+ # @return [String]
43
+ #
44
+ def run(input)
45
+ language = language_from_kaf(input)
46
+ input = StringIO.new(input)
47
+
48
+ reader = Java::java.io.InputStreamReader.new(input.to_inputstream)
49
+ kaf = Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
50
+ annotator = new_annotator(language)
51
+
52
+ annotator.annotatePOSToKAF(kaf, lemmatizer(language), language)
53
+
54
+ return kaf.to_string
55
+ end
56
+
57
+ protected
58
+
59
+ ##
60
+ # Creates and configures a new annotator instance.
61
+ #
62
+ # @param [String] language
63
+ # @return [Java::ehu.pos.Annotate]
64
+ #
65
+ def new_annotator(language)
66
+ annotator = Java::ehu.pos.Annotate.new(language)
67
+
68
+ annotator.disableTimestamp unless options[:enable_time]
69
+
70
+ return annotator
71
+ end
72
+
73
+ ##
74
+ # Returns the lemmatizer to use.
75
+ #
76
+ # @param [String] language
77
+ #
78
+ def lemmatizer(language)
79
+ return Java::ehu.lemmatize.LemmatizerDispatcher.obtainMorfologikLemmatizer(language)
80
+ end
81
+
82
+ ##
83
+ # Returns the language for the given KAF document.
84
+ #
85
+ # @param [String] input
86
+ # @return [String]
87
+ #
88
+ def language_from_kaf(input)
89
+ document = Nokogiri::XML(input)
90
+
91
+ return document.at('KAF').attr('xml:lang')
92
+ end
93
+ end # EnEs
94
+ end # POSTaggers
95
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Spanish POS tagger class. This class forces the language to Spanish
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class ES < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'es'
13
+ end
14
+ end # ES
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # French POS tagger class. This class forces the language to French
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class FR < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'fr'
13
+ end
14
+ end # FR
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Italian POS tagger class. This class forces the language to Italian
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class IT < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'it'
13
+ end
14
+ end # IT
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Dutch POS tagger class. This class forces the language to Dutch
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class NL < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'nl'
13
+ end
14
+ end # NL
15
+ end # POSTaggers
16
+ end # Opener
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module POSTaggers
3
- class EN
4
- VERSION = "2.0.2"
3
+ class EnEs
4
+ VERSION = "2.0.3"
5
5
  end
6
6
  end
7
7
  end
@@ -1,25 +1,30 @@
1
- require File.expand_path('../lib/opener/pos_taggers/en/version', __FILE__)
1
+ require File.expand_path('../lib/opener/pos_taggers/en_es/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "opener-pos-tagger-en-es"
5
- gem.version = Opener::POSTaggers::EN::VERSION
5
+ gem.version = Opener::POSTaggers::EnEs::VERSION
6
6
  gem.authors = ["development@olery.com"]
7
7
  gem.summary = 'POS tagging for English and Spanish'
8
8
  gem.description = gem.summary
9
9
  gem.homepage = "http://opener-project.github.com/"
10
10
  gem.has_rdoc = "yard"
11
11
 
12
+ gem.license = 'Apache 2.0'
13
+
12
14
  gem.required_ruby_version = ">= 1.9.2"
13
15
 
14
16
  gem.files = Dir.glob([
15
17
  'core/target/ehu-pos-*.jar',
16
18
  'lib/**/*',
17
19
  '*.gemspec',
18
- 'README.md'
20
+ 'README.md',
21
+ 'LICENSE.txt'
19
22
  ]).select { |file| File.file?(file) }
20
23
 
21
24
  gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
22
25
 
26
+ gem.add_dependency 'nokogiri'
27
+
23
28
  gem.add_development_dependency 'rspec', '~> 3.0'
24
29
  gem.add_development_dependency 'cucumber'
25
30
  gem.add_development_dependency 'rake'
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-pos-tagger-en-es
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-17 00:00:00.000000000 Z
11
+ date: 2014-06-30 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '0'
25
+ prerelease: false
26
+ type: :runtime
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: rspec
15
29
  version_requirements: !ruby/object:Gem::Requirement
@@ -79,16 +93,25 @@ extra_rdoc_files: []
79
93
  files:
80
94
  - core/target/ehu-pos-1.0.jar
81
95
  - lib/opener/pos_taggers/en.rb
82
- - lib/opener/pos_taggers/en/version.rb
96
+ - lib/opener/pos_taggers/en_es.rb
97
+ - lib/opener/pos_taggers/en_es/nl.rb
98
+ - lib/opener/pos_taggers/en_es/fr.rb
99
+ - lib/opener/pos_taggers/en_es/en.rb
100
+ - lib/opener/pos_taggers/en_es/it.rb
101
+ - lib/opener/pos_taggers/en_es/version.rb
102
+ - lib/opener/pos_taggers/en_es/en_es.rb
103
+ - lib/opener/pos_taggers/en_es/es.rb
83
104
  - opener-pos-tagger-en-es.gemspec
84
105
  - README.md
106
+ - LICENSE.txt
85
107
  - bin/pos-tagger-nl
86
108
  - bin/pos-tagger-en
87
109
  - bin/pos-tagger-fr
88
110
  - bin/pos-tagger-es
89
111
  - bin/pos-tagger-it
90
112
  homepage: http://opener-project.github.com/
91
- licenses: []
113
+ licenses:
114
+ - Apache 2.0
92
115
  metadata: {}
93
116
  post_install_message:
94
117
  rdoc_options: []