opener-pos-tagger-en-es 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +79 -0
- data/bin/pos-tagger-en +13 -0
- data/bin/pos-tagger-es +13 -0
- data/bin/pos-tagger-fr +13 -0
- data/bin/pos-tagger-it +13 -0
- data/bin/pos-tagger-nl +13 -0
- data/core/target/ehu-pos-1.0.jar +0 -0
- data/lib/opener/pos_taggers/en/version.rb +7 -0
- data/lib/opener/pos_taggers/en.rb +118 -0
- data/opener-pos-tagger-en-es.gemspec +27 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 057739e4c1a1282039f21cfc81467a5193578825
|
4
|
+
data.tar.gz: 22a50f89e3f63158ccabd9471d8dea434a11adbc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1736f4efcf334f7ff2c835e68df764c96af65e0d688105b313f6b31c06df9591db231984339c50b92a7c64d08e4a8811328eb62ec8b9eec340e7d62e85549030
|
7
|
+
data.tar.gz: 4813bf565fe3480ef1adb04faaee0c766fa4b16058cbdc6782fca5ff3d80fc24e7fd2bcf8e4c4b348e95237e2e413cabb1e1a78e4cda906dc26e77c58f8ab4c6
|
data/README.md
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
[Build Status](https://drone.io/github.com/opener-project/pos-tagger-en-es/latest)
|
2
|
+
|
3
|
+
# English, Spanish, Dutch, Italian, French POS Tagger
|
4
|
+
|
5
|
+
**Modified to also do Dutch**
|
6
|
+
|
7
|
+
This repository contains the source code for the English, Spanish, Dutch, Italian and French POS tagger of the
|
8
|
+
OpeNER project.
|
9
|
+
|
10
|
+
English perceptron models have been trained and evaluated using the WSJ
|
11
|
+
treebank as explained in K. Toutanova, D. Klein, and C. D. Manning.
|
12
|
+
Feature-rich part-of-speech tagging with a cyclic dependency network. In
|
13
|
+
Proceedings of HLT-NAACL’03, 2003. Currently we obtain a performance of 96.48%
|
14
|
+
vs 97.24% obtained by Toutanova et al. (2003).
|
15
|
+
|
16
|
+
Spanish Maximum Entropy models have been trained and evaluated using the Ancora
|
17
|
+
corpus; it was randomly divided into 90% for training (440K words) and 10% for testing
|
18
|
+
(70K words), obtaining a performance of 98.88%.
|
19
|
+
|
20
|
+
## Requirements
|
21
|
+
|
22
|
+
* Java 1.7 or newer
|
23
|
+
* Ruby 1.9.2 or newer
|
24
|
+
* Maven
|
25
|
+
* Bundler
|
26
|
+
|
27
|
+
## Installation
|
28
|
+
|
29
|
+
Using RubyGems:
|
30
|
+
|
31
|
+
gem install opener-pos-tagger-en-es
|
32
|
+
|
33
|
+
Using Bundler:
|
34
|
+
|
35
|
+
gem 'opener-pos-tagger-en-es',
|
36
|
+
:git => 'git@github.com:opener-project/pos-tagger-en-es.git',
|
37
|
+
:branch => 'master'
|
38
|
+
|
39
|
+
Using specific install:
|
40
|
+
|
41
|
+
gem install specific_install
|
42
|
+
gem specific_install opener-pos-tagger-en-es \
|
43
|
+
-l https://github.com/opener-project/pos-tagger-en-es.git
|
44
|
+
|
45
|
+
## Usage
|
46
|
+
|
47
|
+
cat some_input_file.kaf | pos-tagger-en
|
48
|
+
|
49
|
+
## Contributing
|
50
|
+
|
51
|
+
First make sure all the required dependencies are installed:
|
52
|
+
|
53
|
+
bundle install
|
54
|
+
|
55
|
+
Then compile the required Java code:
|
56
|
+
|
57
|
+
bundle exec rake java:compile
|
58
|
+
|
59
|
+
For this you'll need to have Java 1.7 and Maven installed. These requirements
|
60
|
+
are verified for you before the Rake task calls Maven.
|
61
|
+
|
62
|
+
## Testing
|
63
|
+
|
64
|
+
To run the tests (which are powered by Cucumber), simply run the following:
|
65
|
+
|
66
|
+
bundle exec rake
|
67
|
+
|
68
|
+
This will take care of verifying the requirements, installing the required Java
|
69
|
+
packages and running the tests.
|
70
|
+
|
71
|
+
For more information on the available Rake tasks run the following:
|
72
|
+
|
73
|
+
bundle exec rake -T
|
74
|
+
|
75
|
+
## Structure
|
76
|
+
|
77
|
+
This repository comes in two parts: a collection of Java source files and Ruby
|
78
|
+
source files. The Java code can be found in the `core/` directory, everything
|
79
|
+
else will be Ruby source code.
|
data/bin/pos-tagger-en
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/opener/pos_taggers/en'
|
4
|
+
|
5
|
+
# STDIN.tty? returns `false` if data is being piped into the current process.
|
6
|
+
if STDIN.tty?
|
7
|
+
input = nil
|
8
|
+
else
|
9
|
+
input = STDIN.read
|
10
|
+
end
|
11
|
+
|
12
|
+
kernel = Opener::POSTaggers::EN.new(:args => ARGV)
|
13
|
+
puts kernel.run(input)
|
data/bin/pos-tagger-es
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/opener/pos_taggers/en'
|
4
|
+
|
5
|
+
# STDIN.tty? returns `false` if data is being piped into the current process.
|
6
|
+
if STDIN.tty?
|
7
|
+
input = nil
|
8
|
+
else
|
9
|
+
input = STDIN.read
|
10
|
+
end
|
11
|
+
|
12
|
+
kernel = Opener::POSTaggers::ES.new(:args => ARGV)
|
13
|
+
puts kernel.run(input)
|
data/bin/pos-tagger-fr
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/opener/pos_taggers/en'
|
4
|
+
|
5
|
+
# STDIN.tty? returns `false` if data is being piped into the current process.
|
6
|
+
if STDIN.tty?
|
7
|
+
input = nil
|
8
|
+
else
|
9
|
+
input = STDIN.read
|
10
|
+
end
|
11
|
+
|
12
|
+
kernel = Opener::POSTaggers::FR.new(:args => ARGV)
|
13
|
+
puts kernel.run(input)
|
data/bin/pos-tagger-it
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/opener/pos_taggers/en'
|
4
|
+
|
5
|
+
# STDIN.tty? returns `false` if data is being piped into the current process.
|
6
|
+
if STDIN.tty?
|
7
|
+
input = nil
|
8
|
+
else
|
9
|
+
input = STDIN.read
|
10
|
+
end
|
11
|
+
|
12
|
+
kernel = Opener::POSTaggers::IT.new(:args => ARGV)
|
13
|
+
puts kernel.run(input)
|
data/bin/pos-tagger-nl
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/opener/pos_taggers/en'
|
4
|
+
|
5
|
+
# STDIN.tty? returns `false` if data is being piped into the current process.
|
6
|
+
if STDIN.tty?
|
7
|
+
input = nil
|
8
|
+
else
|
9
|
+
input = STDIN.read
|
10
|
+
end
|
11
|
+
|
12
|
+
kernel = Opener::POSTaggers::NL.new(:args => ARGV)
|
13
|
+
puts kernel.run(input)
|
Binary file
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
require 'java'
|
5
|
+
|
6
|
+
require File.expand_path('../../../../core/target/ehu-pos-1.0.jar', __FILE__)
|
7
|
+
|
8
|
+
import 'java.io.InputStreamReader'
|
9
|
+
import 'ixa.kaflib.KAFDocument'
|
10
|
+
import 'ehu.pos.Annotate'
|
11
|
+
import 'ehu.pos.Resources'
|
12
|
+
import 'ehu.lemmatize.MorfologikLemmatizer'
|
13
|
+
import 'ehu.lemmatize.Dictionary'
|
14
|
+
|
15
|
+
require_relative 'en/version'
|
16
|
+
|
17
|
+
module Opener
|
18
|
+
module POSTaggers
|
19
|
+
##
|
20
|
+
# The English POS tagger; also the base class of the ES, NL, IT and FR taggers.
|
21
|
+
#
|
22
|
+
# @!attribute [r] args
|
23
|
+
# @return [Array]
|
24
|
+
# @!attribute [r] options
|
25
|
+
# @return [Hash]
|
26
|
+
#
|
27
|
+
class EN
|
28
|
+
attr_reader :args, :options
|
29
|
+
|
30
|
+
##
|
31
|
+
# The default language to use.
|
32
|
+
#
|
33
|
+
# @return [String]
|
34
|
+
#
|
35
|
+
DEFAULT_LANGUAGE = 'en'.freeze
|
36
|
+
|
37
|
+
##
|
38
|
+
# @param [Hash] options
|
39
|
+
#
|
40
|
+
# @option options [Array] :args The commandline arguments to pass to the
|
41
|
+
# underlying Java tagger.
|
42
|
+
#
|
43
|
+
def initialize(options = {})
|
44
|
+
@args = options.delete(:args) || []
|
45
|
+
@options = options
|
46
|
+
end
|
47
|
+
|
48
|
+
##
|
49
|
+
# Adds part-of-speech annotations to the given KAF input and returns the
|
50
|
+
# resulting KAF document.
|
51
|
+
#
|
52
|
+
# @param [String|IO] input The KAF input to tag.
|
53
|
+
# @return [String]
|
54
|
+
#
|
55
|
+
def run(input)
|
56
|
+
input = StringIO.new(input) unless input.kind_of?(IO)
|
57
|
+
reader = InputStreamReader.new(input.to_inputstream)
|
58
|
+
kaf = KAFDocument.create_from_stream(reader)
|
59
|
+
annotator = Java::ehu.pos.Annotate.new(language)
|
60
|
+
|
61
|
+
kaf.addLinguisticProcessor("terms","ehu-pos-"+language,"now","1.0");
|
62
|
+
annotator.annotatePOSToKAF(kaf, lemmatizer, language)
|
63
|
+
|
64
|
+
return kaf.to_string
|
65
|
+
end
|
66
|
+
|
67
|
+
protected
|
68
|
+
|
69
|
+
def dictionary
|
70
|
+
Resources.new.getBinaryDict(language)
|
71
|
+
end
|
72
|
+
|
73
|
+
def lemmatizer
|
74
|
+
MorfologikLemmatizer.new(dictionary)
|
75
|
+
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# @return [String]
|
79
|
+
#
|
80
|
+
def language
|
81
|
+
return options[:language] || DEFAULT_LANGUAGE
|
82
|
+
end
|
83
|
+
end # EN
|
84
|
+
|
85
|
+
class ES < EN
|
86
|
+
##
|
87
|
+
# @return [String]
|
88
|
+
#
|
89
|
+
def language
|
90
|
+
return 'es'
|
91
|
+
end
|
92
|
+
end # ES
|
93
|
+
|
94
|
+
class NL < EN
|
95
|
+
def language
|
96
|
+
return 'nl'
|
97
|
+
end
|
98
|
+
end # NL
|
99
|
+
|
100
|
+
class IT < EN
|
101
|
+
##
|
102
|
+
# @return [String]
|
103
|
+
#
|
104
|
+
def language
|
105
|
+
return 'it'
|
106
|
+
end
|
107
|
+
end # IT
|
108
|
+
|
109
|
+
class FR < EN
|
110
|
+
##
|
111
|
+
# @return [String]
|
112
|
+
#
|
113
|
+
def language
|
114
|
+
return 'fr'
|
115
|
+
end
|
116
|
+
end # FR
|
117
|
+
end # POSTaggers
|
118
|
+
end # Opener
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require File.expand_path('../lib/opener/pos_taggers/en/version', __FILE__)
|
2
|
+
|
3
|
+
Gem::Specification.new do |gem|
|
4
|
+
gem.name = "opener-pos-tagger-en-es"
|
5
|
+
gem.version = Opener::POSTaggers::EN::VERSION
|
6
|
+
gem.authors = ["development@olery.com"]
|
7
|
+
gem.summary = "POS tagging for english, spanish, dutch, italian and french"
|
8
|
+
gem.description = gem.summary
|
9
|
+
gem.homepage = "http://opener-project.github.com/"
|
10
|
+
gem.has_rdoc = "yard"
|
11
|
+
gem.required_ruby_version = ">= 1.9.2"
|
12
|
+
|
13
|
+
gem.files = Dir.glob([
|
14
|
+
'core/target/ehu-pos-*.jar',
|
15
|
+
'lib/**/*',
|
16
|
+
'*.gemspec',
|
17
|
+
'README.md'
|
18
|
+
]).select { |file| File.file?(file) }
|
19
|
+
|
20
|
+
gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
|
21
|
+
|
22
|
+
gem.add_dependency 'opener-build-tools'
|
23
|
+
|
24
|
+
gem.add_development_dependency 'rspec'
|
25
|
+
gem.add_development_dependency 'cucumber'
|
26
|
+
gem.add_development_dependency 'rake'
|
27
|
+
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: opener-pos-tagger-en-es
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- development@olery.com
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: opener-build-tools
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: cucumber
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: POS tagging for english, spanish, dutch, italian and french
|
70
|
+
email:
|
71
|
+
executables:
|
72
|
+
- pos-tagger-nl
|
73
|
+
- pos-tagger-en
|
74
|
+
- pos-tagger-fr
|
75
|
+
- pos-tagger-es
|
76
|
+
- pos-tagger-it
|
77
|
+
extensions: []
|
78
|
+
extra_rdoc_files: []
|
79
|
+
files:
|
80
|
+
- README.md
|
81
|
+
- bin/pos-tagger-en
|
82
|
+
- bin/pos-tagger-es
|
83
|
+
- bin/pos-tagger-fr
|
84
|
+
- bin/pos-tagger-it
|
85
|
+
- bin/pos-tagger-nl
|
86
|
+
- core/target/ehu-pos-1.0.jar
|
87
|
+
- lib/opener/pos_taggers/en.rb
|
88
|
+
- lib/opener/pos_taggers/en/version.rb
|
89
|
+
- opener-pos-tagger-en-es.gemspec
|
90
|
+
homepage: http://opener-project.github.com/
|
91
|
+
licenses: []
|
92
|
+
metadata: {}
|
93
|
+
post_install_message:
|
94
|
+
rdoc_options: []
|
95
|
+
require_paths:
|
96
|
+
- lib
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.9.2
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 2.2.2
|
110
|
+
signing_key:
|
111
|
+
specification_version: 4
|
112
|
+
summary: POS tagging for english, spanish, dutch, italian and french
|
113
|
+
test_files: []
|
114
|
+
has_rdoc: yard
|