opener-language-identifier 3.1.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/bin/language-identifier +3 -2
- data/bin/language-identifier-daemon +5 -5
- data/bin/language-identifier-server +6 -4
- data/core/target/LanguageDetection-0.0.1.jar +0 -0
- data/exec/language-identifier.rb +2 -2
- data/lib/opener/language_identifier.rb +8 -8
- data/lib/opener/language_identifier/cli.rb +54 -113
- data/lib/opener/language_identifier/detector.rb +5 -0
- data/lib/opener/language_identifier/server.rb +3 -21
- data/lib/opener/language_identifier/version.rb +1 -1
- data/lib/opener/language_identifier/views/index.erb +1 -11
- data/opener-language-identifier.gemspec +4 -7
- metadata +13 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1c103a6e78b0e47383c82198173460c555eefe19
|
4
|
+
data.tar.gz: 44db5e6da5a34746ef977773fe5bba7b093cd49b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b05e38f97c517b1f4c6527f2d4e941de89e145942aabb2cad675a5938643250a857b37a031896b10f1aad76d553967fe9435c4f9fa956861a1b24419b115fe5
|
7
|
+
data.tar.gz: 8a2aada54780b6f0c0c8f47adf68a0477885ea864e59e088c3d9f4a5e8ca9903ac32d7620c9fac58c78d296b7eb717215ffc9bcbe7d796f88c11385c75d20c0a
|
data/README.md
CHANGED
@@ -4,9 +4,7 @@
|
|
4
4
|
|
5
5
|
The language identifier takes raw text and tries to figure out what language it
|
6
6
|
was written in. The output can either be a plain-text i18n language code or a
|
7
|
-
basic KAF document containing the language and raw input text.
|
8
|
-
|
9
|
-
The output of the language identifier can then be used to drive further text
|
7
|
+
basic KAF document containing the language and raw input text. The output of the language identifier can then be used to drive further text
|
10
8
|
analysis of, for example, sentiments and/or entities.
|
11
9
|
|
12
10
|
## Confused by some terminology?
|
data/bin/language-identifier
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'opener/daemons'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
:name => "language-identifier",
|
9
|
-
:exec_path => exec_path
|
5
|
+
controller = Opener::Daemons::Controller.new(
|
6
|
+
:name => 'opener-language-identifier',
|
7
|
+
:exec_path => File.expand_path("../../exec/language-identifier.rb", __FILE__)
|
10
8
|
)
|
9
|
+
|
10
|
+
controller.run
|
@@ -1,8 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'opener/webservice'
|
4
4
|
|
5
|
-
|
5
|
+
parser = Opener::Webservice::OptionParser.new(
|
6
|
+
'opener-language-identifier',
|
7
|
+
File.expand_path('../../config.ru', __FILE__)
|
8
|
+
)
|
6
9
|
|
7
|
-
|
8
|
-
cli.run
|
10
|
+
parser.run
|
Binary file
|
data/exec/language-identifier.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'opener/daemons'
|
4
|
+
|
4
5
|
require_relative '../lib/opener/language_identifier'
|
5
6
|
|
6
|
-
|
7
|
-
daemon = Opener::Daemons::Daemon.new(Opener::LanguageIdentifier, options)
|
7
|
+
daemon = Opener::Daemons::Daemon.new(Opener::LanguageIdentifier)
|
8
8
|
|
9
9
|
daemon.start
|
@@ -1,8 +1,7 @@
|
|
1
|
+
require 'java'
|
1
2
|
require 'open3'
|
2
|
-
require '
|
3
|
+
require 'slop'
|
3
4
|
require 'builder'
|
4
|
-
require 'java'
|
5
|
-
require 'opener/core'
|
6
5
|
|
7
6
|
require_relative '../../core/target/LanguageDetection-0.0.1.jar'
|
8
7
|
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
@@ -29,8 +28,9 @@ module Opener
|
|
29
28
|
# @return [Hash]
|
30
29
|
#
|
31
30
|
DEFAULT_OPTIONS = {
|
32
|
-
:args
|
33
|
-
:kaf
|
31
|
+
:args => [],
|
32
|
+
:kaf => true,
|
33
|
+
:probs => false
|
34
34
|
}.freeze
|
35
35
|
|
36
36
|
##
|
@@ -42,6 +42,9 @@ module Opener
|
|
42
42
|
# @option options [TrueClass|FalseClass] :kaf When set to `true` the
|
43
43
|
# results will be displayed as KAF.
|
44
44
|
#
|
45
|
+
# @option options [TrueClass|FalseClass] :probs When set, the probabilities
|
46
|
+
# are returned instead of the language/KAF.
|
47
|
+
#
|
45
48
|
def initialize(options = {})
|
46
49
|
@options = DEFAULT_OPTIONS.merge(options)
|
47
50
|
@detector = Detector.instance
|
@@ -63,9 +66,6 @@ module Opener
|
|
63
66
|
end
|
64
67
|
|
65
68
|
return output
|
66
|
-
|
67
|
-
rescue Exception => error
|
68
|
-
return Opener::Core::ErrorLayer.new(input, error.message, self.class).add
|
69
69
|
end
|
70
70
|
|
71
71
|
alias identify run
|
@@ -1,138 +1,79 @@
|
|
1
1
|
module Opener
|
2
2
|
class LanguageIdentifier
|
3
3
|
##
|
4
|
-
# CLI wrapper around {Opener::LanguageIdentifier} using
|
4
|
+
# CLI wrapper around {Opener::LanguageIdentifier} using Slop.
|
5
5
|
#
|
6
|
-
# @!attribute [r]
|
7
|
-
# @return [
|
8
|
-
#
|
9
|
-
# @!attribute [r] option_parser
|
10
|
-
# @return [OptionParser]
|
6
|
+
# @!attribute [r] parser
|
7
|
+
# @return [Slop]
|
11
8
|
#
|
12
9
|
class CLI
|
13
|
-
attr_reader :
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@parser = configure_slop
|
14
|
+
end
|
14
15
|
|
15
16
|
##
|
16
|
-
# @param [
|
17
|
+
# @param [Array] argv
|
17
18
|
#
|
18
|
-
def
|
19
|
-
|
19
|
+
def run(argv = ARGV)
|
20
|
+
parser.parse(argv)
|
21
|
+
end
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
23
|
+
##
|
24
|
+
# @return [Slop]
|
25
|
+
#
|
26
|
+
def configure_slop
|
27
|
+
return Slop.new(:strict => false, :indent => 2, :help => true) do
|
28
|
+
banner 'Usage: language-identifier [OPTIONS]'
|
24
29
|
|
25
|
-
|
26
|
-
show_version
|
27
|
-
end
|
30
|
+
separator <<-EOF.chomp
|
28
31
|
|
29
|
-
|
30
|
-
@options[:kaf] = v
|
31
|
-
end
|
32
|
+
About:
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
34
|
+
Language detection for various languages such as English and Dutch. This
|
35
|
+
command reads input from STDIN. Output can be a language code as plain text,
|
36
|
+
a KAF document containing the input text and language code, or a list of
|
37
|
+
probabilities.
|
38
|
+
|
39
|
+
Example:
|
37
40
|
|
38
|
-
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
|
42
|
-
cat example_text.txt | #{opts.program_name} # Basic detection
|
43
|
-
|
44
|
-
Languages:
|
45
|
-
|
46
|
-
* ar Arabic
|
47
|
-
* bg Bulgarian
|
48
|
-
* bn Bengali
|
49
|
-
* cs Czech
|
50
|
-
* da Danish
|
51
|
-
* de German
|
52
|
-
* el Greek
|
53
|
-
* en English
|
54
|
-
* es Spanish
|
55
|
-
* et Estonian
|
56
|
-
* fa Persian
|
57
|
-
* fi Finnish
|
58
|
-
* fr French
|
59
|
-
* gu Gujarati
|
60
|
-
* he Hebrew
|
61
|
-
* hi Hindi
|
62
|
-
* hr Croatian
|
63
|
-
* hu Hungarian
|
64
|
-
* id Indonesian
|
65
|
-
* it Italian
|
66
|
-
* ja Japanese
|
67
|
-
* kn Kannada
|
68
|
-
* ko Korean
|
69
|
-
* lt Lithuanian
|
70
|
-
* lv Latvian
|
71
|
-
* mk Macedonian
|
72
|
-
* ml Malayalam
|
73
|
-
* mr Marathi
|
74
|
-
* ne Nepali
|
75
|
-
* nl Dutch
|
76
|
-
* no Norwegian
|
77
|
-
* pa Punjabi
|
78
|
-
* pl Polish
|
79
|
-
* pt Portuguese
|
80
|
-
* ro Romanian
|
81
|
-
* ru Russian
|
82
|
-
* sk Slovak
|
83
|
-
* sl Slovene
|
84
|
-
* so Somali
|
85
|
-
* sq Albanian
|
86
|
-
* sv Swedish
|
87
|
-
* sw Swahili
|
88
|
-
* ta Tamil
|
89
|
-
* te Telugu
|
90
|
-
* th Thai
|
91
|
-
* tl Tagalog
|
92
|
-
* tr Turkish
|
93
|
-
* uk Ukrainian
|
94
|
-
* ur Urdu
|
95
|
-
* vi Vietnamese
|
96
|
-
* zh-cn Simplified Chinese
|
97
|
-
* zh-tw Traditional Chinese
|
41
|
+
cat some_file.kaf | language-identifier
|
98
42
|
EOF
|
99
43
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
opts.on_tail("-h", "--help", "Show this message.") do
|
105
|
-
puts opts
|
106
|
-
exit
|
44
|
+
separator "\nOptions:\n"
|
45
|
+
|
46
|
+
on :v, :version, 'Shows the current version' do
|
47
|
+
abort "language-identifier v#{VERSION} on #{RUBY_DESCRIPTION}"
|
107
48
|
end
|
108
|
-
end
|
109
|
-
end
|
110
49
|
|
111
|
-
|
112
|
-
|
113
|
-
#
|
114
|
-
def run(input)
|
115
|
-
option_parser.parse!(options[:args])
|
116
|
-
identifier = LanguageIdentifier.new(options)
|
50
|
+
on :'no-kaf', 'Disables KAF output'
|
51
|
+
on :p, :probs, 'Displays probabilities instead of a language code'
|
117
52
|
|
118
|
-
|
119
|
-
|
120
|
-
|
53
|
+
run do |opts, args|
|
54
|
+
enable_kaf = true
|
55
|
+
enable_probs = false
|
121
56
|
|
122
|
-
|
57
|
+
if opts[:'no-kaf']
|
58
|
+
enable_kaf = false
|
59
|
+
end
|
123
60
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
abort option_parser.to_s
|
129
|
-
end
|
61
|
+
if opts[:probs]
|
62
|
+
enable_kf = false
|
63
|
+
enable_probs = true
|
64
|
+
end
|
130
65
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
66
|
+
identifier = LanguageIdentifier.new(
|
67
|
+
:args => args,
|
68
|
+
:kaf => enable_kaf,
|
69
|
+
:probs => enable_probs
|
70
|
+
)
|
71
|
+
|
72
|
+
input = STDIN.tty? ? nil : STDIN.read
|
73
|
+
|
74
|
+
puts identifier.run(input)
|
75
|
+
end
|
76
|
+
end
|
136
77
|
end
|
137
78
|
end # CLI
|
138
79
|
end # LanguageIdentifier
|
@@ -4,6 +4,11 @@ import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
|
4
4
|
|
5
5
|
module Opener
|
6
6
|
class LanguageIdentifier
|
7
|
+
##
|
8
|
+
# Singleton class wrapped around the Cybozu detector. The Cybozu code uses
|
9
|
+
# the factory pattern and stores a bunch of things on class level. As such
|
10
|
+
# the Cybozu code is *not* thread-safe.
|
11
|
+
#
|
7
12
|
class Detector
|
8
13
|
attr_reader :options
|
9
14
|
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'sinatra/base'
|
2
|
-
require 'httpclient'
|
3
1
|
require 'opener/webservice'
|
4
2
|
|
5
3
|
module Opener
|
@@ -7,27 +5,11 @@ module Opener
|
|
7
5
|
##
|
8
6
|
# A basic language identification server powered by Sinatra.
|
9
7
|
#
|
10
|
-
class Server < Webservice
|
8
|
+
class Server < Opener::Webservice::Server
|
11
9
|
set :views, File.expand_path('../views', __FILE__)
|
12
|
-
text_processor LanguageIdentifier
|
13
|
-
accepted_params :input, :kaf, :benchmark
|
14
10
|
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
# @param [Hash] options The options for the text_processor
|
19
|
-
# @return [String] output the output of the text_processor
|
20
|
-
# @return [Symbol] type the output type ot the text_processor
|
21
|
-
#
|
22
|
-
# @raise RunetimeError Raised when the tagging process failed.
|
23
|
-
#
|
24
|
-
def analyze(options)
|
25
|
-
options[:kaf] = true if options[:kaf].nil?
|
26
|
-
processor = text_processor.new(options)
|
27
|
-
output = processor.run(options[:input])
|
28
|
-
|
29
|
-
return output
|
30
|
-
end
|
11
|
+
self.text_processor = LanguageIdentifier
|
12
|
+
self.accepted_params = [:input, :kaf]
|
31
13
|
end # Server
|
32
14
|
end # LanguageIdentifier
|
33
15
|
end # Opener
|
@@ -32,20 +32,10 @@
|
|
32
32
|
<div>
|
33
33
|
<label for="kaf">
|
34
34
|
<input type='hidden' value='false' name='kaf'>
|
35
|
-
<input type="checkbox" name="kaf" id="kaf" checked/>
|
35
|
+
<input type="checkbox" name="kaf" id="kaf" checked />
|
36
36
|
|
37
37
|
Output KAF instead of just the language code
|
38
38
|
</label>
|
39
|
-
|
40
|
-
<br>
|
41
|
-
|
42
|
-
<label for="benchmark">
|
43
|
-
<input type="checkbox" name="benchmark" />
|
44
|
-
|
45
|
-
Include benchmark output in the KAF
|
46
|
-
</label>
|
47
|
-
|
48
|
-
<br>
|
49
39
|
<br>
|
50
40
|
</div>
|
51
41
|
<% 10.times do |t| %>
|
@@ -25,15 +25,12 @@ Gem::Specification.new do |gem|
|
|
25
25
|
|
26
26
|
gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
|
27
27
|
|
28
|
+
gem.add_dependency 'opener-daemons', '~> 2.2'
|
29
|
+
gem.add_dependency 'opener-webservice', '~> 2.1'
|
30
|
+
|
28
31
|
gem.add_dependency 'builder'
|
29
|
-
gem.add_dependency 'puma'
|
30
|
-
gem.add_dependency 'sinatra', '~>1.4.2'
|
31
|
-
gem.add_dependency 'httpclient'
|
32
|
-
gem.add_dependency 'uuidtools'
|
33
|
-
gem.add_dependency 'opener-webservice'
|
34
|
-
gem.add_dependency 'opener-daemons'
|
35
32
|
gem.add_dependency 'nokogiri'
|
36
|
-
gem.add_dependency '
|
33
|
+
gem.add_dependency 'slop', '~> 3.5'
|
37
34
|
|
38
35
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
39
36
|
gem.add_development_dependency 'cucumber'
|
metadata
CHANGED
@@ -1,101 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-language-identifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 4.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
requirement: !ruby/object:Gem::Requirement
|
21
|
-
requirements:
|
22
|
-
- - '>='
|
23
|
-
- !ruby/object:Gem::Version
|
24
|
-
version: '0'
|
25
|
-
prerelease: false
|
26
|
-
type: :runtime
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: puma
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - '>='
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
requirement: !ruby/object:Gem::Requirement
|
35
|
-
requirements:
|
36
|
-
- - '>='
|
37
|
-
- !ruby/object:Gem::Version
|
38
|
-
version: '0'
|
39
|
-
prerelease: false
|
40
|
-
type: :runtime
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: sinatra
|
14
|
+
name: opener-daemons
|
43
15
|
version_requirements: !ruby/object:Gem::Requirement
|
44
16
|
requirements:
|
45
17
|
- - ~>
|
46
18
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
19
|
+
version: '2.2'
|
48
20
|
requirement: !ruby/object:Gem::Requirement
|
49
21
|
requirements:
|
50
22
|
- - ~>
|
51
23
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
53
|
-
prerelease: false
|
54
|
-
type: :runtime
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: httpclient
|
57
|
-
version_requirements: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
requirement: !ruby/object:Gem::Requirement
|
63
|
-
requirements:
|
64
|
-
- - '>='
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: '0'
|
67
|
-
prerelease: false
|
68
|
-
type: :runtime
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: uuidtools
|
71
|
-
version_requirements: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - '>='
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
requirement: !ruby/object:Gem::Requirement
|
77
|
-
requirements:
|
78
|
-
- - '>='
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: '0'
|
24
|
+
version: '2.2'
|
81
25
|
prerelease: false
|
82
26
|
type: :runtime
|
83
27
|
- !ruby/object:Gem::Dependency
|
84
28
|
name: opener-webservice
|
85
29
|
version_requirements: !ruby/object:Gem::Requirement
|
86
30
|
requirements:
|
87
|
-
- -
|
31
|
+
- - ~>
|
88
32
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
33
|
+
version: '2.1'
|
90
34
|
requirement: !ruby/object:Gem::Requirement
|
91
35
|
requirements:
|
92
|
-
- -
|
36
|
+
- - ~>
|
93
37
|
- !ruby/object:Gem::Version
|
94
|
-
version: '
|
38
|
+
version: '2.1'
|
95
39
|
prerelease: false
|
96
40
|
type: :runtime
|
97
41
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
42
|
+
name: builder
|
99
43
|
version_requirements: !ruby/object:Gem::Requirement
|
100
44
|
requirements:
|
101
45
|
- - '>='
|
@@ -123,17 +67,17 @@ dependencies:
|
|
123
67
|
prerelease: false
|
124
68
|
type: :runtime
|
125
69
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
70
|
+
name: slop
|
127
71
|
version_requirements: !ruby/object:Gem::Requirement
|
128
72
|
requirements:
|
129
73
|
- - ~>
|
130
74
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
75
|
+
version: '3.5'
|
132
76
|
requirement: !ruby/object:Gem::Requirement
|
133
77
|
requirements:
|
134
78
|
- - ~>
|
135
79
|
- !ruby/object:Gem::Version
|
136
|
-
version: '
|
80
|
+
version: '3.5'
|
137
81
|
prerelease: false
|
138
82
|
type: :runtime
|
139
83
|
- !ruby/object:Gem::Dependency
|