opener-pos-tagger-en-es 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 057739e4c1a1282039f21cfc81467a5193578825
4
+ data.tar.gz: 22a50f89e3f63158ccabd9471d8dea434a11adbc
5
+ SHA512:
6
+ metadata.gz: 1736f4efcf334f7ff2c835e68df764c96af65e0d688105b313f6b31c06df9591db231984339c50b92a7c64d08e4a8811328eb62ec8b9eec340e7d62e85549030
7
+ data.tar.gz: 4813bf565fe3480ef1adb04faaee0c766fa4b16058cbdc6782fca5ff3d80fc24e7fd2bcf8e4c4b348e95237e2e413cabb1e1a78e4cda906dc26e77c58f8ab4c6
data/README.md ADDED
@@ -0,0 +1,79 @@
1
+ [![Build Status](https://drone.io/github.com/opener-project/pos-tagger-en-es/status.png)](https://drone.io/github.com/opener-project/pos-tagger-en-es/latest)
2
+
3
+ # English, Spanish, Dutch, Italian, French POS Tagger
4
+
5
+ **Modified to also do Dutch**
6
+
7
+ This repository contains the source code for the English, Spanish, Dutch, Italian and French POS tagger of the
8
+ OpeNER project.
9
+
10
+ English perceptron models have been trained and evaluated using the WSJ
11
+ treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
12
+ Feature-rich part-of-speech tagging with a cyclic dependency network. In
13
+ Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.48%
14
+ vs 97.24% obtained by Toutanova et al. (2003).
15
+
16
+ Spanish Maximum Entropy models have been trained and evaluated using the Ancora
17
+ corpus; it was randomly divided into 90% for training (440K words) and 10% for testing
18
+ (70K words), obtaining a performance of 98.88%.
19
+
20
+ ## Requirements
21
+
22
+ * Java 1.7 or newer
23
+ * Ruby 1.9.2 or newer
24
+ * Maven
25
+ * Bundler
26
+
27
+ ## Installation
28
+
29
+ Using RubyGems:
30
+
31
+ gem install opener-pos-tagger-en-es
32
+
33
+ Using Bundler:
34
+
35
+ gem 'opener-pos-tagger-en-es',
36
+ :git => 'git@github.com:opener-project/pos-tagger-en-es.git',
37
+ :branch => 'master'
38
+
39
+ Using specific install:
40
+
41
+ gem install specific_install
42
+ gem specific_install opener-pos-tagger-en-es \
43
+ -l https://github.com/opener-project/pos-tagger-en-es.git
44
+
45
+ ## Usage
46
+
47
+ cat some_input_file.kaf | pos-tagger-en
48
+
49
+ ## Contributing
50
+
51
+ First make sure all the required dependencies are installed:
52
+
53
+ bundle install
54
+
55
+ Then compile the required Java code:
56
+
57
+ bundle exec rake java:compile
58
+
59
+ For this you'll need to have Java 1.7 and Maven installed. These requirements
60
+ are verified for you before the Rake task calls Maven.
61
+
62
+ ## Testing
63
+
64
+ To run the tests (which are powered by Cucumber), simply run the following:
65
+
66
+ bundle exec rake
67
+
68
+ This will take care of verifying the requirements, installing the required Java
69
+ packages and running the tests.
70
+
71
+ For more information on the available Rake tasks run the following:
72
+
73
+ bundle exec rake -T
74
+
75
+ ## Structure
76
+
77
+ This repository comes in two parts: a collection of Java source files and Ruby
78
+ source files. The Java code can be found in the `core/` directory, everything
79
+ else will be Ruby source code.
data/bin/pos-tagger-en ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

require_relative '../lib/opener/pos_taggers/en'

# Command-line entry point for the English tagger.
#
# STDIN is only consumed when data is piped into the process; on an
# interactive terminal (STDIN.tty? is true) nothing is read and nil is
# handed to the tagger instead.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::EN.new(:args => ARGV)

puts tagger.run(input)
data/bin/pos-tagger-es ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# The Spanish tagger class is loaded through the same library file as the
# English one.
require_relative '../lib/opener/pos_taggers/en'

# Command-line entry point for the Spanish tagger.
#
# STDIN is only consumed when data is piped into the process; on an
# interactive terminal (STDIN.tty? is true) nothing is read and nil is
# handed to the tagger instead.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::ES.new(:args => ARGV)

puts tagger.run(input)
data/bin/pos-tagger-fr ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# The French tagger class is loaded through the same library file as the
# English one.
require_relative '../lib/opener/pos_taggers/en'

# Command-line entry point for the French tagger.
#
# STDIN is only consumed when data is piped into the process; on an
# interactive terminal (STDIN.tty? is true) nothing is read and nil is
# handed to the tagger instead.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::FR.new(:args => ARGV)

puts tagger.run(input)
data/bin/pos-tagger-it ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# The Italian tagger class is loaded through the same library file as the
# English one.
require_relative '../lib/opener/pos_taggers/en'

# Command-line entry point for the Italian tagger.
#
# STDIN is only consumed when data is piped into the process; on an
# interactive terminal (STDIN.tty? is true) nothing is read and nil is
# handed to the tagger instead.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::IT.new(:args => ARGV)

puts tagger.run(input)
data/bin/pos-tagger-nl ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# The Dutch tagger class is loaded through the same library file as the
# English one.
require_relative '../lib/opener/pos_taggers/en'

# Command-line entry point for the Dutch tagger.
#
# STDIN is only consumed when data is piped into the process; on an
# interactive terminal (STDIN.tty? is true) nothing is read and nil is
# handed to the tagger instead.
input = STDIN.tty? ? nil : STDIN.read

tagger = Opener::POSTaggers::NL.new(:args => ARGV)

puts tagger.run(input)
Binary file
@@ -0,0 +1,7 @@
1
module Opener
  module POSTaggers
    class EN
      ##
      # Version string of the POS tagger; the gemspec reads this constant to
      # set the gem version. Frozen so it cannot be modified in place.
      #
      # @return [String]
      #
      VERSION = "2.0.0".freeze
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'open3'
2
+ require 'stringio'
3
+
4
+ require 'java'
5
+
6
+ require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
7
+
8
+ import 'java.io.InputStreamReader'
9
+ import 'ixa.kaflib.KAFDocument'
10
+ import 'ehu.pos.Annotate'
11
+ import 'ehu.pos.Resources'
12
+ import 'ehu.lemmatize.MorfologikLemmatizer'
13
+ import 'ehu.lemmatize.Dictionary'
14
+
15
+ require_relative 'en/version'
16
+
17
module Opener
  module POSTaggers
    ##
    # POS tagger kernel built on the bundled `ehu-pos` Java classes. `EN` is
    # the base implementation; the subclasses below (`ES`, `NL`, `IT`, `FR`)
    # only pin the language code.
    #
    # @!attribute [r] args
    #  @return [Array]
    # @!attribute [r] options
    #  @return [Hash]
    #
    class EN
      attr_reader :args, :options

      ##
      # The default language to use.
      #
      # @return [String]
      #
      DEFAULT_LANGUAGE = 'en'.freeze

      ##
      # @param [Hash] options
      #
      # @option options [Array] :args The commandline arguments; stored and
      #  exposed through {#args}.
      # @option options [String] :language The language code to use instead
      #  of {DEFAULT_LANGUAGE} (ignored by subclasses that hard-code it).
      #
      def initialize(options = {})
        # Work on a copy so the caller's Hash is left untouched (and frozen
        # Hashes are accepted).
        options  = options.dup
        @args    = options.delete(:args) || []
        @options = options
      end

      ##
      # Parses the input as a KAF document, annotates it with POS tags and
      # returns the resulting document.
      #
      # @param [String|IO|StringIO] input The KAF input to tag. Strings are
      #  wrapped in a StringIO; IO and StringIO objects are used as-is.
      # @return [String] The annotated document as returned by `to_string`.
      #
      def run(input)
        # A StringIO is not an IO, so it needs its own check; wrapping one in
        # another StringIO would raise.
        stream_like = input.kind_of?(IO) || input.kind_of?(StringIO)
        input       = StringIO.new(input) unless stream_like

        reader    = InputStreamReader.new(input.to_inputstream)
        kaf       = KAFDocument.create_from_stream(reader)
        annotator = Java::ehu.pos.Annotate.new(language)

        kaf.addLinguisticProcessor("terms", "ehu-pos-" + language, "now", "1.0")
        annotator.annotatePOSToKAF(kaf, lemmatizer, language)

        kaf.to_string
      end

      protected

      ##
      # Loads the binary dictionary for the current language.
      #
      def dictionary
        Resources.new.getBinaryDict(language)
      end

      ##
      # Builds a new lemmatizer on top of {#dictionary}.
      #
      def lemmatizer
        MorfologikLemmatizer.new(dictionary)
      end

      ##
      # The language code: the `:language` option when given, otherwise
      # {DEFAULT_LANGUAGE}.
      #
      # @return [String]
      #
      def language
        options[:language] || DEFAULT_LANGUAGE
      end
    end # EN

    class ES < EN
      ##
      # @return [String]
      #
      def language
        'es'
      end
    end # ES

    class NL < EN
      ##
      # @return [String]
      #
      def language
        'nl'
      end
    end # NL

    class IT < EN
      ##
      # @return [String]
      #
      def language
        'it'
      end
    end # IT

    class FR < EN
      ##
      # @return [String]
      #
      def language
        'fr'
      end
    end # FR
  end # POSTaggers
end # Opener
@@ -0,0 +1,27 @@
1
+ require File.expand_path('../lib/opener/pos_taggers/en/version', __FILE__)
2
+
3
# Gem specification for the OpeNER POS tagger; the version is taken from
# Opener::POSTaggers::EN::VERSION.
Gem::Specification.new do |spec|
  spec.name                  = "opener-pos-tagger-en-es"
  spec.version               = Opener::POSTaggers::EN::VERSION
  spec.authors               = ["development@olery.com"]
  spec.summary               = "POS tagging for english, spanish, dutch, italian and french"
  spec.description           = spec.summary
  spec.homepage              = "http://opener-project.github.com/"
  spec.has_rdoc              = "yard"
  spec.required_ruby_version = ">= 1.9.2"

  # Glob patterns of everything that is packaged; directories matched by the
  # patterns are filtered out below so only regular files remain.
  packaged_patterns = [
    'core/target/ehu-pos-*.jar',
    'lib/**/*',
    '*.gemspec',
    'README.md'
  ]

  spec.files = Dir.glob(packaged_patterns).select { |path| File.file?(path) }

  # Every file in bin/ is exposed as an executable, by its base name.
  spec.executables = Dir.glob('bin/*').map { |path| File.basename(path) }

  spec.add_dependency 'opener-build-tools'

  %w[rspec cucumber rake].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: opener-pos-tagger-en-es
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ platform: ruby
6
+ authors:
7
+ - development@olery.com
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: opener-build-tools
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: cucumber
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: POS tagging for english, spanish, dutch, italian and french
70
+ email:
71
+ executables:
72
+ - pos-tagger-nl
73
+ - pos-tagger-en
74
+ - pos-tagger-fr
75
+ - pos-tagger-es
76
+ - pos-tagger-it
77
+ extensions: []
78
+ extra_rdoc_files: []
79
+ files:
80
+ - README.md
81
+ - bin/pos-tagger-en
82
+ - bin/pos-tagger-es
83
+ - bin/pos-tagger-fr
84
+ - bin/pos-tagger-it
85
+ - bin/pos-tagger-nl
86
+ - core/target/ehu-pos-1.0.jar
87
+ - lib/opener/pos_taggers/en.rb
88
+ - lib/opener/pos_taggers/en/version.rb
89
+ - opener-pos-tagger-en-es.gemspec
90
+ homepage: http://opener-project.github.com/
91
+ licenses: []
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.9.2
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: POS tagging for english, spanish, dutch, italian and french
113
+ test_files: []
114
+ has_rdoc: yard