opener-pos-tagger-en-es 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89157ca3c263b71b58644e6d5affb554ba67bf68
4
- data.tar.gz: b61b257b2e6af19487d7c477f6ab9c8573260d5c
3
+ metadata.gz: cf8e78b3be6d89b98ddc381d07ad70f5e0a454e4
4
+ data.tar.gz: 4c92415365aacfa409539d9257ecec6749b647a7
5
5
  SHA512:
6
- metadata.gz: a78849a4f64d09070a073b25f433e345539e7fed6528720aae83113db9ad3565953594051135f7e3e4f41a2957f5dd63048c2ae67a20d3e37a78f1bcc4f05dde
7
- data.tar.gz: 2835d0aae655bc7b5d364f97f476acc2786e73a78a48862b705445631c435d30c5e5a48a1e0bbb20f16622c3b59ae10d4b3fbe885b107f90cc9f8e163ab25f8c
6
+ metadata.gz: 13e994b170f0519cce76392d7e37d989c766796b1d4760127e41103974a9625ce71dece012166ef58c3eae0640f713d115150ea6a31831e3508c3911d11ed163
7
+ data.tar.gz: c8a2f87ad1be16242ae300d5b61f8b730ab1bcbce55419c84668e7261fba2a0316066f3c336f5a5ea4cfaf9bc32df0b6c0410b0e1e1ff684e49c52aa13eef996
@@ -0,0 +1,13 @@
1
+ Copyright 2014 OpeNER Project Consortium
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md CHANGED
@@ -2,20 +2,25 @@
2
2
 
3
3
  # English, Spanish, Dutch, Italian, French POS Tagger
4
4
 
5
- **Modified to also do Dutch**
6
-
7
5
  This repository contains the source code for the English & Spanish POS tagger of the
8
6
  OpeNER project.
9
7
 
10
- English perceptron models have been trained and evaluated using the WSJ
11
- treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
12
- Feature-rich part-of-speech tagging with a cyclic dependency network. In
13
- Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.48%
14
- vs 97.24% obtained by Toutanova et al. (2003).
8
+ * English perceptron models have been trained and evaluated using the WSJ
9
+ treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
10
+ Feature-rich part-of-speech tagging with a cyclic dependency network. In
11
+ Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.87%
12
+ vs 97.24% obtained by Toutanova et al. (2003).
13
+
14
+ * Spanish Maximum Entropy models have been trained and evaluated using the Ancora
15
+ corpus; it was randomly divided into 90% for training (450K words) and 10% for testing
16
+ (50K words), obtaining a performance of 98.88%.
17
+
18
+ * French Maximum Entropy models trained with the ESTER corpus.
19
+
20
+ * Italian Perceptron models trained with the TUT Treebank.
15
21
 
16
- Spanish Maximum Entropy models have been trained and evaluated using the Ancora
17
- corpus; it was randomly divided in 90% for training (440K words) and 10% testing
18
- (70K words), obtaining a performance of 98.88%.
22
+ * Dutch Perceptron model publicly available at Apache OpenNLP website:
23
+ (http://opennlp.sourceforge.net/models-1.5/)
19
24
 
20
25
  ## Requirements
21
26
 
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::EN.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::EN.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -10,4 +10,5 @@ else
10
10
  end
11
11
 
12
12
  kernel = Opener::POSTaggers::ES.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::FR.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::FR.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::IT.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::IT.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/opener/pos_taggers/en'
3
+ require_relative '../lib/opener/pos_taggers/en_es'
4
4
 
5
5
  # STDIN.tty? returns `false` if data is being piped into the current process.
6
6
  if STDIN.tty?
@@ -9,5 +9,6 @@ else
9
9
  input = STDIN.read
10
10
  end
11
11
 
12
- kernel = Opener::POSTaggers::NL.new(:args => ARGV)
12
+ kernel = Opener::POSTaggers::NL.new(:args => ARGV)
13
+
13
14
  puts kernel.run(input)
Binary file
@@ -1,118 +1,3 @@
1
- require 'open3'
2
- require 'stringio'
3
-
4
- require 'java'
5
-
6
- require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
7
-
8
- import 'java.io.InputStreamReader'
9
- import 'ixa.kaflib.KAFDocument'
10
- import 'ehu.pos.Annotate'
11
- import 'ehu.pos.Resources'
12
- import 'ehu.lemmatize.MorfologikLemmatizer'
13
- #import 'ehu.lemmatize.Dictionary'
14
- import 'ehu.lemmatize.LemmatizerDispatcher'
15
-
16
- require_relative 'en/version'
17
-
18
- module Opener
19
- module POSTaggers
20
- ##
21
- # The POS tagger that supports English and Spanish.
22
- #
23
- # @!attribute [r] args
24
- # @return [Array]
25
- # @!attribute [r] options
26
- # @return [Hash]
27
- #
28
- class EN
29
- attr_reader :args, :options
30
-
31
- ##
32
- # The default language to use.
33
- #
34
- # @return [String]
35
- #
36
- DEFAULT_LANGUAGE = 'en'.freeze
37
-
38
- ##
39
- # @param [Hash] options
40
- #
41
- # @option options [Array] :args The commandline arguments to pass to the
42
- # underlying Python script.
43
- #
44
- def initialize(options = {})
45
- @args = options.delete(:args) || []
46
- @options = options
47
- end
48
-
49
- ##
50
- # Runs the command and returns the output of STDOUT, STDERR and the
51
- # process information.
52
- #
53
- # @param [String] input The input to tag.
54
- # @return [Array]
55
- #
56
- def run(input)
57
- input = StringIO.new(input) unless input.kind_of?(IO)
58
- reader = InputStreamReader.new(input.to_inputstream)
59
- kaf = KAFDocument.create_from_stream(reader)
60
- annotator = Java::ehu.pos.Annotate.new(language)
61
-
62
- kaf.addLinguisticProcessor("terms","ehu-pos-"+language,"now","1.0");
63
- annotator.annotatePOSToKAF(kaf, lemmatizer, language)
64
-
65
- return kaf.to_string
66
- end
67
-
68
- protected
69
-
70
- ##
71
- # Returns the lemmatizer to use.
72
- #
73
- def lemmatizer
74
- return LemmatizerDispatcher.obtainMorfologikLemmatizer(language)
75
- end
76
-
77
- ##
78
- # @return [String]
79
- #
80
- def language
81
- return options[:language] || DEFAULT_LANGUAGE
82
- end
83
- end # EN
84
-
85
- class ES < EN
86
- ##
87
- # @return [String]
88
- #
89
- def language
90
- return 'es'
91
- end
92
- end # ES
93
-
94
- class NL < EN
95
- def language
96
- return 'nl'
97
- end
98
- end # NL
99
-
100
- class IT < EN
101
- ##
102
- # @return [String]
103
- #
104
- def language
105
- return 'it'
106
- end
107
- end # IT
108
-
109
- class FR < EN
110
- ##
111
- # @return [String]
112
- #
113
- def language
114
- return 'fr'
115
- end
116
- end # FR
117
- end # POSTaggers
118
- end # Opener
1
+ # This file exists for backwards compatibility; it's recommended to require
2
+ # en_es.rb instead.
3
+ require_relative 'en_es'
@@ -0,0 +1,19 @@
1
+ require 'open3'
2
+ require 'stringio'
3
+ require 'nokogiri'
4
+
5
+ require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
6
+
7
+ # import 'java.io.InputStreamReader'
8
+ # import 'ixa.kaflib.KAFDocument'
9
+ # import 'ehu.pos.Resources'
10
+ # import 'ehu.lemmatize.MorfologikLemmatizer'
11
+ # import 'ehu.lemmatize.LemmatizerDispatcher'
12
+
13
+ require_relative 'en_es/version'
14
+ require_relative 'en_es/en_es'
15
+ require_relative 'en_es/en'
16
+ require_relative 'en_es/es'
17
+ require_relative 'en_es/nl'
18
+ require_relative 'en_es/it'
19
+ require_relative 'en_es/fr'
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # English POS tagger class. This class forces the language to English
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class EN < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'en'
13
+ end
14
+ end # EN
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,95 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Base POS tagger class for the various language specific ones such as
5
+ # {Opener::POSTaggers::FR}.
6
+ #
7
+ # @!attribute [r] args
8
+ # @return [Array]
9
+ #
10
+ # @!attribute [r] options
11
+ # @return [Hash]
12
+ #
13
+ class EnEs
14
+ attr_reader :args, :options
15
+
16
+ ##
17
+ # The default options to use.
18
+ #
19
+ # @return [Hash]
20
+ #
21
+ DEFAULT_OPTIONS = {
22
+ :enable_time => true
23
+ }
24
+
25
+ ##
26
+ # @param [Hash] options
27
+ #
28
+ # @option options [Array] :args
29
+ #
30
+ # @option options [TrueClass|FalseClass] :enable_time When set to `true`
31
+ # (default) dynamic timestamps will be added.
32
+ #
33
+ def initialize(options = {})
34
+ @args = options.delete(:args) || []
35
+ @options = DEFAULT_OPTIONS.merge(options)
36
+ end
37
+
38
+ ##
39
+ # Runs the command and returns the resulting KAF document.
40
+ #
41
+ # @param [String] input The input to tag.
42
+ # @return [String]
43
+ #
44
+ def run(input)
45
+ language = language_from_kaf(input)
46
+ input = StringIO.new(input)
47
+
48
+ reader = Java::java.io.InputStreamReader.new(input.to_inputstream)
49
+ kaf = Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
50
+ annotator = new_annotator(language)
51
+
52
+ annotator.annotatePOSToKAF(kaf, lemmatizer(language), language)
53
+
54
+ return kaf.to_string
55
+ end
56
+
57
+ protected
58
+
59
+ ##
60
+ # Creates and configures a new annotator instance.
61
+ #
62
+ # @param [String] language
63
+ # @return [Java::ehu.pos.Annotate]
64
+ #
65
+ def new_annotator(language)
66
+ annotator = Java::ehu.pos.Annotate.new(language)
67
+
68
+ annotator.disableTimestamp unless options[:enable_time]
69
+
70
+ return annotator
71
+ end
72
+
73
+ ##
74
+ # Returns the lemmatizer to use.
75
+ #
76
+ # @param [String] language
77
+ #
78
+ def lemmatizer(language)
79
+ return Java::ehu.lemmatize.LemmatizerDispatcher.obtainMorfologikLemmatizer(language)
80
+ end
81
+
82
+ ##
83
+ # Returns the language for the given KAF document.
84
+ #
85
+ # @param [String] input
86
+ # @return [String]
87
+ #
88
+ def language_from_kaf(input)
89
+ document = Nokogiri::XML(input)
90
+
91
+ return document.at('KAF').attr('xml:lang')
92
+ end
93
+ end # Base
94
+ end # POSTaggers
95
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Spanish POS tagger class. This class forces the language to Spanish
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class ES < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'es'
13
+ end
14
+ end # ES
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # French POS tagger class. This class forces the language to French
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class FR < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'fr'
13
+ end
14
+ end # FR
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Italian POS tagger class. This class forces the language to Italian
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class IT < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'it'
13
+ end
14
+ end # IT
15
+ end # POSTaggers
16
+ end # Opener
@@ -0,0 +1,16 @@
1
+ module Opener
2
+ module POSTaggers
3
+ ##
4
+ # Dutch POS tagger class. This class forces the language to Dutch
5
+ # regardless of what the KAF document claims the language to be.
6
+ #
7
+ class NL < EnEs
8
+ ##
9
+ # @see [Opener::POSTaggers::EnEs#language_from_kaf]
10
+ #
11
+ def language_from_kaf(input)
12
+ return 'nl'
13
+ end
14
+ end # NL
15
+ end # POSTaggers
16
+ end # Opener
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module POSTaggers
3
- class EN
4
- VERSION = "2.0.2"
3
+ class EnEs
4
+ VERSION = "2.0.3"
5
5
  end
6
6
  end
7
7
  end
@@ -1,25 +1,30 @@
1
- require File.expand_path('../lib/opener/pos_taggers/en/version', __FILE__)
1
+ require File.expand_path('../lib/opener/pos_taggers/en_es/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "opener-pos-tagger-en-es"
5
- gem.version = Opener::POSTaggers::EN::VERSION
5
+ gem.version = Opener::POSTaggers::EnEs::VERSION
6
6
  gem.authors = ["development@olery.com"]
7
7
  gem.summary = 'POS tagging for English and Spanish'
8
8
  gem.description = gem.summary
9
9
  gem.homepage = "http://opener-project.github.com/"
10
10
  gem.has_rdoc = "yard"
11
11
 
12
+ gem.license = 'Apache 2.0'
13
+
12
14
  gem.required_ruby_version = ">= 1.9.2"
13
15
 
14
16
  gem.files = Dir.glob([
15
17
  'core/target/ehu-pos-*.jar',
16
18
  'lib/**/*',
17
19
  '*.gemspec',
18
- 'README.md'
20
+ 'README.md',
21
+ 'LICENSE.txt'
19
22
  ]).select { |file| File.file?(file) }
20
23
 
21
24
  gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
22
25
 
26
+ gem.add_dependency 'nokogiri'
27
+
23
28
  gem.add_development_dependency 'rspec', '~> 3.0'
24
29
  gem.add_development_dependency 'cucumber'
25
30
  gem.add_development_dependency 'rake'
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-pos-tagger-en-es
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-17 00:00:00.000000000 Z
11
+ date: 2014-06-30 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '0'
25
+ prerelease: false
26
+ type: :runtime
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: rspec
15
29
  version_requirements: !ruby/object:Gem::Requirement
@@ -79,16 +93,25 @@ extra_rdoc_files: []
79
93
  files:
80
94
  - core/target/ehu-pos-1.0.jar
81
95
  - lib/opener/pos_taggers/en.rb
82
- - lib/opener/pos_taggers/en/version.rb
96
+ - lib/opener/pos_taggers/en_es.rb
97
+ - lib/opener/pos_taggers/en_es/nl.rb
98
+ - lib/opener/pos_taggers/en_es/fr.rb
99
+ - lib/opener/pos_taggers/en_es/en.rb
100
+ - lib/opener/pos_taggers/en_es/it.rb
101
+ - lib/opener/pos_taggers/en_es/version.rb
102
+ - lib/opener/pos_taggers/en_es/en_es.rb
103
+ - lib/opener/pos_taggers/en_es/es.rb
83
104
  - opener-pos-tagger-en-es.gemspec
84
105
  - README.md
106
+ - LICENSE.txt
85
107
  - bin/pos-tagger-nl
86
108
  - bin/pos-tagger-en
87
109
  - bin/pos-tagger-fr
88
110
  - bin/pos-tagger-es
89
111
  - bin/pos-tagger-it
90
112
  homepage: http://opener-project.github.com/
91
- licenses: []
113
+ licenses:
114
+ - Apache 2.0
92
115
  metadata: {}
93
116
  post_install_message:
94
117
  rdoc_options: []