opener-language-identifier 3.1.7 → 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/bin/language-identifier +3 -2
- data/bin/language-identifier-daemon +5 -5
- data/bin/language-identifier-server +6 -4
- data/core/target/LanguageDetection-0.0.1.jar +0 -0
- data/exec/language-identifier.rb +2 -2
- data/lib/opener/language_identifier.rb +8 -8
- data/lib/opener/language_identifier/cli.rb +54 -113
- data/lib/opener/language_identifier/detector.rb +5 -0
- data/lib/opener/language_identifier/server.rb +3 -21
- data/lib/opener/language_identifier/version.rb +1 -1
- data/lib/opener/language_identifier/views/index.erb +1 -11
- data/opener-language-identifier.gemspec +4 -7
- metadata +13 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1c103a6e78b0e47383c82198173460c555eefe19
|
4
|
+
data.tar.gz: 44db5e6da5a34746ef977773fe5bba7b093cd49b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b05e38f97c517b1f4c6527f2d4e941de89e145942aabb2cad675a5938643250a857b37a031896b10f1aad76d553967fe9435c4f9fa956861a1b24419b115fe5
|
7
|
+
data.tar.gz: 8a2aada54780b6f0c0c8f47adf68a0477885ea864e59e088c3d9f4a5e8ca9903ac32d7620c9fac58c78d296b7eb717215ffc9bcbe7d796f88c11385c75d20c0a
|
data/README.md
CHANGED
@@ -4,9 +4,7 @@
|
|
4
4
|
|
5
5
|
The language identifier takes raw text and tries to figure out what language it
|
6
6
|
was written in. The output can either be a plain-text i18n language code or a
|
7
|
-
basic KAF document containing the language and raw input text.
|
8
|
-
|
9
|
-
The output of the language identifier can then be used to drive further text
|
7
|
+
basic KAF document containing the language and raw input text. The output of the language identifier can then be used to drive further text
|
10
8
|
analysis of, for example, sentiments and/or entities.
|
11
9
|
|
12
10
|
## Confused by some terminology?
|
data/bin/language-identifier
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'opener/daemons'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
:name => "language-identifier",
|
9
|
-
:exec_path => exec_path
|
5
|
+
controller = Opener::Daemons::Controller.new(
|
6
|
+
:name => 'opener-language-identifier',
|
7
|
+
:exec_path => File.expand_path("../../exec/language-identifier.rb", __FILE__)
|
10
8
|
)
|
9
|
+
|
10
|
+
controller.run
|
@@ -1,8 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'opener/webservice'
|
4
4
|
|
5
|
-
|
5
|
+
parser = Opener::Webservice::OptionParser.new(
|
6
|
+
'opener-language-identifier',
|
7
|
+
File.expand_path('../../config.ru', __FILE__)
|
8
|
+
)
|
6
9
|
|
7
|
-
|
8
|
-
cli.run
|
10
|
+
parser.run
|
Binary file
|
data/exec/language-identifier.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'opener/daemons'
|
4
|
+
|
4
5
|
require_relative '../lib/opener/language_identifier'
|
5
6
|
|
6
|
-
|
7
|
-
daemon = Opener::Daemons::Daemon.new(Opener::LanguageIdentifier, options)
|
7
|
+
daemon = Opener::Daemons::Daemon.new(Opener::LanguageIdentifier)
|
8
8
|
|
9
9
|
daemon.start
|
@@ -1,8 +1,7 @@
|
|
1
|
+
require 'java'
|
1
2
|
require 'open3'
|
2
|
-
require '
|
3
|
+
require 'slop'
|
3
4
|
require 'builder'
|
4
|
-
require 'java'
|
5
|
-
require 'opener/core'
|
6
5
|
|
7
6
|
require_relative '../../core/target/LanguageDetection-0.0.1.jar'
|
8
7
|
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
@@ -29,8 +28,9 @@ module Opener
|
|
29
28
|
# @return [Hash]
|
30
29
|
#
|
31
30
|
DEFAULT_OPTIONS = {
|
32
|
-
:args
|
33
|
-
:kaf
|
31
|
+
:args => [],
|
32
|
+
:kaf => true,
|
33
|
+
:probs => false
|
34
34
|
}.freeze
|
35
35
|
|
36
36
|
##
|
@@ -42,6 +42,9 @@ module Opener
|
|
42
42
|
# @option options [TrueClass|FalseClass] :kaf When set to `true` the
|
43
43
|
# results will be displayed as KAF.
|
44
44
|
#
|
45
|
+
# @option options [TrueClass|FalseClass] :probs When set, the probabilities
|
46
|
+
# are returned instead of the language/KAF.
|
47
|
+
#
|
45
48
|
def initialize(options = {})
|
46
49
|
@options = DEFAULT_OPTIONS.merge(options)
|
47
50
|
@detector = Detector.instance
|
@@ -63,9 +66,6 @@ module Opener
|
|
63
66
|
end
|
64
67
|
|
65
68
|
return output
|
66
|
-
|
67
|
-
rescue Exception => error
|
68
|
-
return Opener::Core::ErrorLayer.new(input, error.message, self.class).add
|
69
69
|
end
|
70
70
|
|
71
71
|
alias identify run
|
@@ -1,138 +1,79 @@
|
|
1
1
|
module Opener
|
2
2
|
class LanguageIdentifier
|
3
3
|
##
|
4
|
-
# CLI wrapper around {Opener::LanguageIdentifier} using
|
4
|
+
# CLI wrapper around {Opener::LanguageIdentifier} using Slop.
|
5
5
|
#
|
6
|
-
# @!attribute [r]
|
7
|
-
# @return [
|
8
|
-
#
|
9
|
-
# @!attribute [r] option_parser
|
10
|
-
# @return [OptionParser]
|
6
|
+
# @!attribute [r] parser
|
7
|
+
# @return [Slop]
|
11
8
|
#
|
12
9
|
class CLI
|
13
|
-
attr_reader :
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@parser = configure_slop
|
14
|
+
end
|
14
15
|
|
15
16
|
##
|
16
|
-
# @param [
|
17
|
+
# @param [Array] argv
|
17
18
|
#
|
18
|
-
def
|
19
|
-
|
19
|
+
def run(argv = ARGV)
|
20
|
+
parser.parse(argv)
|
21
|
+
end
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
23
|
+
##
|
24
|
+
# @return [Slop]
|
25
|
+
#
|
26
|
+
def configure_slop
|
27
|
+
return Slop.new(:strict => false, :indent => 2, :help => true) do
|
28
|
+
banner 'Usage: language-identifier [OPTIONS]'
|
24
29
|
|
25
|
-
|
26
|
-
show_version
|
27
|
-
end
|
30
|
+
separator <<-EOF.chomp
|
28
31
|
|
29
|
-
|
30
|
-
@options[:kaf] = v
|
31
|
-
end
|
32
|
+
About:
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
34
|
+
Language detection for various languages such as English and Dutch. This
|
35
|
+
command reads input from STDIN. Output can be a language code as plain text,
|
36
|
+
a KAF document containing the input text and language code, or a list of
|
37
|
+
probabilities.
|
38
|
+
|
39
|
+
Example:
|
37
40
|
|
38
|
-
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
|
42
|
-
cat example_text.txt | #{opts.program_name} # Basic detection
|
43
|
-
|
44
|
-
Languages:
|
45
|
-
|
46
|
-
* ar Arabic
|
47
|
-
* bg Bulgarian
|
48
|
-
* bn Bengali
|
49
|
-
* cs Czech
|
50
|
-
* da Danish
|
51
|
-
* de German
|
52
|
-
* el Greek
|
53
|
-
* en English
|
54
|
-
* es Spanish
|
55
|
-
* et Estonian
|
56
|
-
* fa Persian
|
57
|
-
* fi Finnish
|
58
|
-
* fr French
|
59
|
-
* gu Gujarati
|
60
|
-
* he Hebrew
|
61
|
-
* hi Hindi
|
62
|
-
* hr Croatian
|
63
|
-
* hu Hungarian
|
64
|
-
* id Indonesian
|
65
|
-
* it Italian
|
66
|
-
* ja Japanese
|
67
|
-
* kn Kannada
|
68
|
-
* ko Korean
|
69
|
-
* lt Lithuanian
|
70
|
-
* lv Latvian
|
71
|
-
* mk Macedonian
|
72
|
-
* ml Malayalam
|
73
|
-
* mr Marathi
|
74
|
-
* ne Nepali
|
75
|
-
* nl Dutch
|
76
|
-
* no Norwegian
|
77
|
-
* pa Punjabi
|
78
|
-
* pl Polish
|
79
|
-
* pt Portuguese
|
80
|
-
* ro Romanian
|
81
|
-
* ru Russian
|
82
|
-
* sk Slovak
|
83
|
-
* sl Slovene
|
84
|
-
* so Somali
|
85
|
-
* sq Albanian
|
86
|
-
* sv Swedish
|
87
|
-
* sw Swahili
|
88
|
-
* ta Tamil
|
89
|
-
* te Telugu
|
90
|
-
* th Thai
|
91
|
-
* tl Tagalog
|
92
|
-
* tr Turkish
|
93
|
-
* uk Ukrainian
|
94
|
-
* ur Urdu
|
95
|
-
* vi Vietnamese
|
96
|
-
* zh-cn Simplified Chinese
|
97
|
-
* zh-tw Traditional Chinese
|
41
|
+
cat some_file.kaf | language-identifier
|
98
42
|
EOF
|
99
43
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
opts.on_tail("-h", "--help", "Show this message.") do
|
105
|
-
puts opts
|
106
|
-
exit
|
44
|
+
separator "\nOptions:\n"
|
45
|
+
|
46
|
+
on :v, :version, 'Shows the current version' do
|
47
|
+
abort "language-identifier v#{VERSION} on #{RUBY_DESCRIPTION}"
|
107
48
|
end
|
108
|
-
end
|
109
|
-
end
|
110
49
|
|
111
|
-
|
112
|
-
|
113
|
-
#
|
114
|
-
def run(input)
|
115
|
-
option_parser.parse!(options[:args])
|
116
|
-
identifier = LanguageIdentifier.new(options)
|
50
|
+
on :'no-kaf', 'Disables KAF output'
|
51
|
+
on :p, :probs, 'Displays probabilities instead of a language code'
|
117
52
|
|
118
|
-
|
119
|
-
|
120
|
-
|
53
|
+
run do |opts, args|
|
54
|
+
enable_kaf = true
|
55
|
+
enable_probs = false
|
121
56
|
|
122
|
-
|
57
|
+
if opts[:'no-kaf']
|
58
|
+
enable_kaf = false
|
59
|
+
end
|
123
60
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
abort option_parser.to_s
|
129
|
-
end
|
61
|
+
if opts[:probs]
|
62
|
+
enable_kaf = false
|
63
|
+
enable_probs = true
|
64
|
+
end
|
130
65
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
66
|
+
identifier = LanguageIdentifier.new(
|
67
|
+
:args => args,
|
68
|
+
:kaf => enable_kaf,
|
69
|
+
:probs => enable_probs
|
70
|
+
)
|
71
|
+
|
72
|
+
input = STDIN.tty? ? nil : STDIN.read
|
73
|
+
|
74
|
+
puts identifier.run(input)
|
75
|
+
end
|
76
|
+
end
|
136
77
|
end
|
137
78
|
end # CLI
|
138
79
|
end # LanguageIdentifier
|
@@ -4,6 +4,11 @@ import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
|
4
4
|
|
5
5
|
module Opener
|
6
6
|
class LanguageIdentifier
|
7
|
+
##
|
8
|
+
# Singleton class wrapped around the Cybozu detector. The Cybozu code uses
|
9
|
+
# the factory pattern and stores a bunch of things on class level. As such
|
10
|
+
# the Cybozu code is *not* thread-safe.
|
11
|
+
#
|
7
12
|
class Detector
|
8
13
|
attr_reader :options
|
9
14
|
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'sinatra/base'
|
2
|
-
require 'httpclient'
|
3
1
|
require 'opener/webservice'
|
4
2
|
|
5
3
|
module Opener
|
@@ -7,27 +5,11 @@ module Opener
|
|
7
5
|
##
|
8
6
|
# A basic language identification server powered by Sinatra.
|
9
7
|
#
|
10
|
-
class Server < Webservice
|
8
|
+
class Server < Opener::Webservice::Server
|
11
9
|
set :views, File.expand_path('../views', __FILE__)
|
12
|
-
text_processor LanguageIdentifier
|
13
|
-
accepted_params :input, :kaf, :benchmark
|
14
10
|
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
# @param [Hash] options The options for the text_processor
|
19
|
-
# @return [String] output the output of the text_processor
|
20
|
-
# @return [Symbol] type the output type ot the text_processor
|
21
|
-
#
|
22
|
-
# @raise RunetimeError Raised when the tagging process failed.
|
23
|
-
#
|
24
|
-
def analyze(options)
|
25
|
-
options[:kaf] = true if options[:kaf].nil?
|
26
|
-
processor = text_processor.new(options)
|
27
|
-
output = processor.run(options[:input])
|
28
|
-
|
29
|
-
return output
|
30
|
-
end
|
11
|
+
self.text_processor = LanguageIdentifier
|
12
|
+
self.accepted_params = [:input, :kaf]
|
31
13
|
end # Server
|
32
14
|
end # LanguageIdentifier
|
33
15
|
end # Opener
|
@@ -32,20 +32,10 @@
|
|
32
32
|
<div>
|
33
33
|
<label for="kaf">
|
34
34
|
<input type='hidden' value='false' name='kaf'>
|
35
|
-
<input type="checkbox" name="kaf" id="kaf" checked/>
|
35
|
+
<input type="checkbox" name="kaf" id="kaf" checked />
|
36
36
|
|
37
37
|
Output KAF instead of just the language code
|
38
38
|
</label>
|
39
|
-
|
40
|
-
<br>
|
41
|
-
|
42
|
-
<label for="benchmark">
|
43
|
-
<input type="checkbox" name="benchmark" />
|
44
|
-
|
45
|
-
Include benchmark output in the KAF
|
46
|
-
</label>
|
47
|
-
|
48
|
-
<br>
|
49
39
|
<br>
|
50
40
|
</div>
|
51
41
|
<% 10.times do |t| %>
|
@@ -25,15 +25,12 @@ Gem::Specification.new do |gem|
|
|
25
25
|
|
26
26
|
gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
|
27
27
|
|
28
|
+
gem.add_dependency 'opener-daemons', '~> 2.2'
|
29
|
+
gem.add_dependency 'opener-webservice', '~> 2.1'
|
30
|
+
|
28
31
|
gem.add_dependency 'builder'
|
29
|
-
gem.add_dependency 'puma'
|
30
|
-
gem.add_dependency 'sinatra', '~>1.4.2'
|
31
|
-
gem.add_dependency 'httpclient'
|
32
|
-
gem.add_dependency 'uuidtools'
|
33
|
-
gem.add_dependency 'opener-webservice'
|
34
|
-
gem.add_dependency 'opener-daemons'
|
35
32
|
gem.add_dependency 'nokogiri'
|
36
|
-
gem.add_dependency '
|
33
|
+
gem.add_dependency 'slop', '~> 3.5'
|
37
34
|
|
38
35
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
39
36
|
gem.add_development_dependency 'cucumber'
|
metadata
CHANGED
@@ -1,101 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-language-identifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 4.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
requirement: !ruby/object:Gem::Requirement
|
21
|
-
requirements:
|
22
|
-
- - '>='
|
23
|
-
- !ruby/object:Gem::Version
|
24
|
-
version: '0'
|
25
|
-
prerelease: false
|
26
|
-
type: :runtime
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: puma
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
requirement: !ruby/object:Gem::Requirement
|
35
|
-
requirements:
|
36
|
-
- - '>='
|
37
|
-
- !ruby/object:Gem::Version
|
38
|
-
version: '0'
|
39
|
-
prerelease: false
|
40
|
-
type: :runtime
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: sinatra
|
14
|
+
name: opener-daemons
|
43
15
|
version_requirements: !ruby/object:Gem::Requirement
|
44
16
|
requirements:
|
45
17
|
- - ~>
|
46
18
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
19
|
+
version: '2.2'
|
48
20
|
requirement: !ruby/object:Gem::Requirement
|
49
21
|
requirements:
|
50
22
|
- - ~>
|
51
23
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
53
|
-
prerelease: false
|
54
|
-
type: :runtime
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: httpclient
|
57
|
-
version_requirements: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
requirement: !ruby/object:Gem::Requirement
|
63
|
-
requirements:
|
64
|
-
- - '>='
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: '0'
|
67
|
-
prerelease: false
|
68
|
-
type: :runtime
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: uuidtools
|
71
|
-
version_requirements: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - '>='
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
requirement: !ruby/object:Gem::Requirement
|
77
|
-
requirements:
|
78
|
-
- - '>='
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: '0'
|
24
|
+
version: '2.2'
|
81
25
|
prerelease: false
|
82
26
|
type: :runtime
|
83
27
|
- !ruby/object:Gem::Dependency
|
84
28
|
name: opener-webservice
|
85
29
|
version_requirements: !ruby/object:Gem::Requirement
|
86
30
|
requirements:
|
87
|
-
- -
|
31
|
+
- - ~>
|
88
32
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
33
|
+
version: '2.1'
|
90
34
|
requirement: !ruby/object:Gem::Requirement
|
91
35
|
requirements:
|
92
|
-
- -
|
36
|
+
- - ~>
|
93
37
|
- !ruby/object:Gem::Version
|
94
|
-
version: '
|
38
|
+
version: '2.1'
|
95
39
|
prerelease: false
|
96
40
|
type: :runtime
|
97
41
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
42
|
+
name: builder
|
99
43
|
version_requirements: !ruby/object:Gem::Requirement
|
100
44
|
requirements:
|
101
45
|
- - '>='
|
@@ -123,17 +67,17 @@ dependencies:
|
|
123
67
|
prerelease: false
|
124
68
|
type: :runtime
|
125
69
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
70
|
+
name: slop
|
127
71
|
version_requirements: !ruby/object:Gem::Requirement
|
128
72
|
requirements:
|
129
73
|
- - ~>
|
130
74
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
75
|
+
version: '3.5'
|
132
76
|
requirement: !ruby/object:Gem::Requirement
|
133
77
|
requirements:
|
134
78
|
- - ~>
|
135
79
|
- !ruby/object:Gem::Version
|
136
|
-
version: '
|
80
|
+
version: '3.5'
|
137
81
|
prerelease: false
|
138
82
|
type: :runtime
|
139
83
|
- !ruby/object:Gem::Dependency
|