opener-pos-tagger-en-es 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 057739e4c1a1282039f21cfc81467a5193578825
4
+ data.tar.gz: 22a50f89e3f63158ccabd9471d8dea434a11adbc
5
+ SHA512:
6
+ metadata.gz: 1736f4efcf334f7ff2c835e68df764c96af65e0d688105b313f6b31c06df9591db231984339c50b92a7c64d08e4a8811328eb62ec8b9eec340e7d62e85549030
7
+ data.tar.gz: 4813bf565fe3480ef1adb04faaee0c766fa4b16058cbdc6782fca5ff3d80fc24e7fd2bcf8e4c4b348e95237e2e413cabb1e1a78e4cda906dc26e77c58f8ab4c6
data/README.md ADDED
@@ -0,0 +1,79 @@
1
+ [![Build Status](https://drone.io/github.com/opener-project/pos-tagger-en-es/status.png)](https://drone.io/github.com/opener-project/pos-tagger-en-es/latest)
2
+
3
+ # English, Spanish, Dutch, Italian, French POS Tagger
4
+
5
+ **Modified to also do Dutch**
6
+
7
+ This repository contains the source code for the English, Spanish, Dutch, Italian and French POS tagger of the
8
+ OpeNER project.
9
+
10
+ English perceptron models have been trained and evaluated using the WSJ
11
+ treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
12
+ Feature-rich part-of-speech tagging with a cyclic dependency network. In
13
+ Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.48%
14
+ vs 97.24% obtained by Toutanova et al. (2003).
15
+
16
+ Spanish Maximum Entropy models have been trained and evaluated using the Ancora
17
+ corpus; it was randomly divided into 90% for training (440K words) and 10% for testing
18
+ (70K words), obtaining a performance of 98.88%.
19
+
20
+ ## Requirements
21
+
22
+ * Java 1.7 or newer
23
+ * Ruby 1.9.2 or newer
24
+ * Maven
25
+ * Bundler
26
+
27
+ ## Installation
28
+
29
+ Using RubyGems:
30
+
31
+ gem install opener-pos-tagger-en-es
32
+
33
+ Using Bundler:
34
+
35
+ gem 'opener-pos-tagger-en-es',
36
+ :git => 'git@github.com:opener-project/pos-tagger-en-es.git',
37
+ :branch => 'master'
38
+
39
+ Using specific install:
40
+
41
+ gem install specific_install
42
+ gem specific_install opener-pos-tagger-en-es \
43
+ -l https://github.com/opener-project/pos-tagger-en-es.git
44
+
45
+ ## Usage
46
+
47
+ cat some_input_file.kaf | pos-tagger-en   # or pos-tagger-es, pos-tagger-fr, pos-tagger-it, pos-tagger-nl
48
+
49
+ ## Contributing
50
+
51
+ First make sure all the required dependencies are installed:
52
+
53
+ bundle install
54
+
55
+ Then compile the required Java code:
56
+
57
+ bundle exec rake java:compile
58
+
59
+ For this you'll need to have Java 1.7 and Maven installed. These requirements
60
+ are verified for you before the Rake task calls Maven.
61
+
62
+ ## Testing
63
+
64
+ To run the tests (which are powered by Cucumber), simply run the following:
65
+
66
+ bundle exec rake
67
+
68
+ This will take care of verifying the requirements, installing the required Java
69
+ packages and running the tests.
70
+
71
+ For more information on the available Rake tasks run the following:
72
+
73
+ bundle exec rake -T
74
+
75
+ ## Structure
76
+
77
+ This repository comes in two parts: a collection of Java source files and Ruby
78
+ source files. The Java code can be found in the `core/` directory, everything
79
+ else will be Ruby source code.
data/bin/pos-tagger-en ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/en'
4
+
5
+ # STDIN.tty? returns `false` if data is being piped into the current process.
6
+ if STDIN.tty?
7
+ input = nil
8
+ else
9
+ input = STDIN.read
10
+ end
11
+
12
+ kernel = Opener::POSTaggers::EN.new(:args => ARGV)
13
+ puts kernel.run(input)
data/bin/pos-tagger-es ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/en'
4
+
5
+ # STDIN.tty? returns `false` if data is being piped into the current process.
6
+ if STDIN.tty?
7
+ input = nil
8
+ else
9
+ input = STDIN.read
10
+ end
11
+
12
+ kernel = Opener::POSTaggers::ES.new(:args => ARGV)
13
+ puts kernel.run(input)
data/bin/pos-tagger-fr ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/en'
4
+
5
+ # STDIN.tty? returns `false` if data is being piped into the current process.
6
+ if STDIN.tty?
7
+ input = nil
8
+ else
9
+ input = STDIN.read
10
+ end
11
+
12
+ kernel = Opener::POSTaggers::FR.new(:args => ARGV)
13
+ puts kernel.run(input)
data/bin/pos-tagger-it ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/en'
4
+
5
+ # STDIN.tty? returns `false` if data is being piped into the current process.
6
+ if STDIN.tty?
7
+ input = nil
8
+ else
9
+ input = STDIN.read
10
+ end
11
+
12
+ kernel = Opener::POSTaggers::IT.new(:args => ARGV)
13
+ puts kernel.run(input)
data/bin/pos-tagger-nl ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/opener/pos_taggers/en'
4
+
5
+ # STDIN.tty? returns `false` if data is being piped into the current process.
6
+ if STDIN.tty?
7
+ input = nil
8
+ else
9
+ input = STDIN.read
10
+ end
11
+
12
+ kernel = Opener::POSTaggers::NL.new(:args => ARGV)
13
+ puts kernel.run(input)
Binary file
@@ -0,0 +1,7 @@
1
+ module Opener
2
+ module POSTaggers
3
+ class EN
4
+ VERSION = "2.0.0"
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,118 @@
1
+ require 'open3'
2
+ require 'stringio'
3
+
4
+ require 'java'
5
+
6
+ require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
7
+
8
+ import 'java.io.InputStreamReader'
9
+ import 'ixa.kaflib.KAFDocument'
10
+ import 'ehu.pos.Annotate'
11
+ import 'ehu.pos.Resources'
12
+ import 'ehu.lemmatize.MorfologikLemmatizer'
13
+ import 'ehu.lemmatize.Dictionary'
14
+
15
+ require_relative 'en/version'
16
+
17
+ module Opener
18
+ module POSTaggers
19
+ ##
20
+ # The POS tagger for English (by default); also the base class of the ES, NL, IT and FR taggers.
21
+ #
22
+ # @!attribute [r] args
23
+ # @return [Array]
24
+ # @!attribute [r] options
25
+ # @return [Hash]
26
+ #
27
+ class EN
28
+ attr_reader :args, :options
29
+
30
+ ##
31
+ # The default language to use.
32
+ #
33
+ # @return [String]
34
+ #
35
+ DEFAULT_LANGUAGE = 'en'.freeze
36
+
37
+ ##
38
+ # @param [Hash] options
39
+ #
40
+ # @option options [Array] :args The commandline arguments. They are
41
+ # stored in `args`; nothing else in this file reads them.
42
+ #
43
+ def initialize(options = {})
44
+ @args = options.delete(:args) || []
45
+ @options = options
46
+ end
47
+
48
+ ##
49
+ # Parses the given KAF input, adds POS tags and lemmas for the
50
+ # configured language and returns the resulting KAF document.
51
+ #
52
+ # @param [String, IO] input The KAF input to tag.
53
+ # @return [String]
54
+ #
55
+ def run(input)
56
+ input = StringIO.new(input) unless input.kind_of?(IO)
57
+ reader = InputStreamReader.new(input.to_inputstream)
58
+ kaf = KAFDocument.create_from_stream(reader)
59
+ annotator = Java::ehu.pos.Annotate.new(language)
60
+
61
+ kaf.addLinguisticProcessor("terms","ehu-pos-"+language,"now","1.0");
62
+ annotator.annotatePOSToKAF(kaf, lemmatizer, language)
63
+
64
+ return kaf.to_string
65
+ end
66
+
67
+ protected
68
+
69
+ def dictionary
70
+ Resources.new.getBinaryDict(language)
71
+ end
72
+
73
+ def lemmatizer
74
+ MorfologikLemmatizer.new(dictionary)
75
+ end
76
+
77
+ ##
78
+ # @return [String]
79
+ #
80
+ def language
81
+ return options[:language] || DEFAULT_LANGUAGE
82
+ end
83
+ end # EN
84
+
85
+ class ES < EN
86
+ ##
87
+ # @return [String]
88
+ #
89
+ def language
90
+ return 'es'
91
+ end
92
+ end # ES
93
+
94
+ class NL < EN
95
+ def language
96
+ return 'nl'
97
+ end
98
+ end # NL
99
+
100
+ class IT < EN
101
+ ##
102
+ # @return [String]
103
+ #
104
+ def language
105
+ return 'it'
106
+ end
107
+ end # IT
108
+
109
+ class FR < EN
110
+ ##
111
+ # @return [String]
112
+ #
113
+ def language
114
+ return 'fr'
115
+ end
116
+ end # FR
117
+ end # POSTaggers
118
+ end # Opener
@@ -0,0 +1,27 @@
1
+ require File.expand_path('../lib/opener/pos_taggers/en/version', __FILE__)
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.name = "opener-pos-tagger-en-es"
5
+ gem.version = Opener::POSTaggers::EN::VERSION
6
+ gem.authors = ["development@olery.com"]
7
+ gem.summary = "POS tagging for english, spanish, dutch, italian and french"
8
+ gem.description = gem.summary
9
+ gem.homepage = "http://opener-project.github.com/"
10
+ gem.has_rdoc = "yard"
11
+ gem.required_ruby_version = ">= 1.9.2"
12
+
13
+ gem.files = Dir.glob([
14
+ 'core/target/ehu-pos-*.jar',
15
+ 'lib/**/*',
16
+ '*.gemspec',
17
+ 'README.md'
18
+ ]).select { |file| File.file?(file) }
19
+
20
+ gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
21
+
22
+ gem.add_dependency 'opener-build-tools'
23
+
24
+ gem.add_development_dependency 'rspec'
25
+ gem.add_development_dependency 'cucumber'
26
+ gem.add_development_dependency 'rake'
27
+ end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: opener-pos-tagger-en-es
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ platform: ruby
6
+ authors:
7
+ - development@olery.com
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: opener-build-tools
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: cucumber
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: POS tagging for english, spanish, dutch, italian and french
70
+ email:
71
+ executables:
72
+ - pos-tagger-nl
73
+ - pos-tagger-en
74
+ - pos-tagger-fr
75
+ - pos-tagger-es
76
+ - pos-tagger-it
77
+ extensions: []
78
+ extra_rdoc_files: []
79
+ files:
80
+ - README.md
81
+ - bin/pos-tagger-en
82
+ - bin/pos-tagger-es
83
+ - bin/pos-tagger-fr
84
+ - bin/pos-tagger-it
85
+ - bin/pos-tagger-nl
86
+ - core/target/ehu-pos-1.0.jar
87
+ - lib/opener/pos_taggers/en.rb
88
+ - lib/opener/pos_taggers/en/version.rb
89
+ - opener-pos-tagger-en-es.gemspec
90
+ homepage: http://opener-project.github.com/
91
+ licenses: []
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.9.2
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: POS tagging for english, spanish, dutch, italian and french
113
+ test_files: []
114
+ has_rdoc: yard