opener-language-identifier 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +138 -0
- data/bin/language-identifier +6 -0
- data/bin/language-identifier-daemon +10 -0
- data/bin/language-identifier-server +8 -0
- data/config.ru +4 -0
- data/core/target/LanguageDetection-0.0.1.jar +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/Language.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/Messages.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +128 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
- data/exec/language-identifier.rb +9 -0
- data/lib/opener/language_identifier.rb +89 -0
- data/lib/opener/language_identifier/cli.rb +139 -0
- data/lib/opener/language_identifier/detector.rb +36 -0
- data/lib/opener/language_identifier/kaf_builder.rb +62 -0
- data/lib/opener/language_identifier/public/markdown.css +283 -0
- data/lib/opener/language_identifier/server.rb +32 -0
- data/lib/opener/language_identifier/version.rb +5 -0
- data/lib/opener/language_identifier/views/index.erb +110 -0
- data/lib/opener/language_identifier/views/result.erb +15 -0
- data/opener-language-identifier.gemspec +37 -0
- metadata +231 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
module Opener
|
2
|
+
class LanguageIdentifier
|
3
|
+
##
# CLI wrapper around {Opener::LanguageIdentifier} using OptionParser.
#
# @!attribute [r] options
#  @return [Hash]
#
# @!attribute [r] option_parser
#  @return [OptionParser]
#
class CLI
  attr_reader :options, :option_parser

  ##
  # Merges the given options into DEFAULT_OPTIONS and builds the option
  # parser. Nothing is parsed until {#run} is called.
  #
  # @param [Hash] options Overrides for DEFAULT_OPTIONS. The `:args` entry
  #  is the argument list handed to the option parser by {#run}.
  #
  def initialize(options = {})
    @options = DEFAULT_OPTIONS.merge(options)

    @option_parser = OptionParser.new do |opts|
      opts.program_name   = 'language-identifier'
      opts.summary_indent = ' '

      opts.on('-v', '--version', 'Shows the current version') do
        show_version
      end

      opts.on('-k', '--[no-]kaf', 'Output the language as KAF') do |v|
        @options[:kaf] = v
      end

      # --probs always switches KAF output off, regardless of the default.
      opts.on('-p', '--probs', 'Provide probabilities, assumes --no-kaf') do
        @options[:kaf]   = false
        @options[:probs] = true
      end

      opts.separator <<-EOF

Examples:

cat example_text.txt | #{opts.program_name} # Basic detection

Languages:

* ar Arabic
* bg Bulgarian
* bn Bengali
* cs Czech
* da Danish
* de German
* el Greek
* en English
* es Spanish
* et Estonian
* fa Persian
* fi Finnish
* fr French
* gu Gujarati
* he Hebrew
* hi Hindi
* hr Croatian
* hu Hungarian
* id Indonesian
* it Italian
* ja Japanese
* kn Kannada
* ko Korean
* lt Lithuanian
* lv Latvian
* mk Macedonian
* ml Malayalam
* mr Marathi
* ne Nepali
* nl Dutch
* no Norwegian
* pa Punjabi
* pl Polish
* pt Portuguese
* ro Romanian
* ru Russian
* sk Slovak
* sl Slovene
* so Somali
* sq Albanian
* sv Swedish
* sw Swahili
* ta Tamil
* te Telugu
* th Thai
* tl Tagalog
* tr Turkish
* uk Ukrainian
* ur Urdu
* vi Vietnamese
* zh-cn Simplified Chinese
* zh-tw Traditional Chinese
      EOF

      opts.separator ""
      opts.separator "Common options:"

      # Registered with on_tail so it is listed last in the options summary.
      opts.on_tail("-h", "--help", "Show this message.") do
        show_help
      end
    end
  end

  ##
  # Parses the command-line arguments stored in `options[:args]`, runs the
  # language identifier on the input and prints the result to STDOUT.
  #
  # @param [String] input
  #
  def run(input)
    option_parser.parse!(options[:args])
    identifier = LanguageIdentifier.new(options)

    output = identifier.run(input)
    puts output
  end

  private

  ##
  # Shows the help message and exits the program.
  #
  # The message goes to STDOUT and the exit status is 0: asking for help is
  # a successful request, not an error.
  #
  def show_help
    puts option_parser.to_s
    exit
  end

  ##
  # Shows the version and exits the program.
  #
  # The version goes to STDOUT and the exit status is 0 (`abort` would
  # write to STDERR and exit with status 1).
  #
  def show_version
    puts "#{option_parser.program_name} v#{VERSION} on #{RUBY_DESCRIPTION}"
    exit
  end
end # CLI
|
138
|
+
end # LanguageIdentifier
|
139
|
+
end # Opener
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
4
|
+
|
5
|
+
module Opener
|
6
|
+
class LanguageIdentifier
|
7
|
+
##
# Singleton wrapper around the Java `CybozuDetector` class. Every call into
# the wrapped detector goes through a Mutex, so only one thread uses it at
# a time.
#
# @!attribute [r] options
#  @return [Hash]
#
class Detector
  include Singleton

  attr_reader :options

  ##
  # @param [Hash] options Only `:profiles_path` is read by this class.
  #
  def initialize(options={})
    @options   = options
    @detector  = CybozuDetector.new(profiles_path)
    @semaphore = Mutex.new
  end

  ##
  # Runs `CybozuDetector#detect` on the input while holding the lock.
  #
  # @param [String] input
  # @return The value returned by `CybozuDetector#detect` (presumably a
  #  language code; confirm against the Java class).
  #
  def detect(input)
    @semaphore.synchronize do
      @detector.detect(input)
    end
  end

  ##
  # Runs `CybozuDetector#detect_langs` on the input while holding the lock.
  #
  # @param [String] input
  # @return The value returned by `CybozuDetector#detect_langs`.
  #
  def probabilities(input)
    @semaphore.synchronize do
      @detector.detect_langs(input)
    end
  end

  ##
  # Directory handed to `CybozuDetector.new`. Uses `options[:profiles_path]`
  # when given, otherwise `core/target/classes/profiles` resolved relative
  # to this file.
  #
  # @return [String]
  #
  def profiles_path
    default_path = File.expand_path("../../../../core/target/classes/profiles", __FILE__)
    options.fetch(:profiles_path, default_path)
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Opener
|
2
|
+
class LanguageIdentifier
|
3
|
+
##
# Class for building basic KAF documents that contain the correct language
# tag and the raw input that was analyzed.
#
# @!attribute [r] xml
#  @return [Builder::XmlMarkup]
#
# @!attribute [r] original_text
#  @return [String]
#
# @!attribute [r] language
#  @return [String]
#
class KafBuilder
  attr_reader :xml, :original_text, :language

  ##
  # @param [String] text The input text that was analyzed.
  # @param [String] language The language of the text. Surrounding
  #  whitespace is stripped.
  #
  def initialize(text, language)
    @xml           = Builder::XmlMarkup.new(:indent => 2)
    @language      = language.strip
    @original_text = text
  end

  ##
  # Builds the KAF document: an XML declaration followed by a `KAF` root
  # element carrying the language and KAF version, with the original text
  # in a nested `raw` element. The markup is written to {#xml}; use {#to_s}
  # to read it back.
  #
  def build
    xml.instruct!(
      :xml,
      :version    => '1.0',
      :encoding   => 'UTF-8',
      :standalone => 'yes'
    )

    xml.KAF('xml:lang' => language, 'version' => version) do |node|
      node.raw(original_text)
    end
  end

  ##
  # Returns the XML document as a String.
  #
  # @return [String]
  #
  def to_s
    xml.target!
  end

  ##
  # The KAF version written to the `version` attribute of the root element.
  #
  # @return [String]
  #
  def version
    "2.1"
  end
end # KafBuilder
|
61
|
+
end # LanguageIdentifier
|
62
|
+
end # Opener
|
@@ -0,0 +1,283 @@
|
|
1
|
+
input[type="text"], textarea
|
2
|
+
{
|
3
|
+
width: 500px;
|
4
|
+
}
|
5
|
+
|
6
|
+
/* Base typography, background and spacing for the page. */
body {
  font-family: Helvetica, arial, sans-serif;
  font-size: 14px;
  line-height: 1.6;
  background-color: white;
  /* The shorthand sets all four sides. The earlier padding-top/padding-bottom
     declarations of 10px were overridden by it and have been removed. */
  padding: 30px; }
|
14
|
+
|
15
|
+
body > *:first-child {
|
16
|
+
margin-top: 0 !important; }
|
17
|
+
body > *:last-child {
|
18
|
+
margin-bottom: 0 !important; }
|
19
|
+
|
20
|
+
a {
|
21
|
+
color: #4183C4; }
|
22
|
+
a.absent {
|
23
|
+
color: #cc0000; }
|
24
|
+
a.anchor {
|
25
|
+
display: block;
|
26
|
+
padding-left: 30px;
|
27
|
+
margin-left: -30px;
|
28
|
+
cursor: pointer;
|
29
|
+
position: absolute;
|
30
|
+
top: 0;
|
31
|
+
left: 0;
|
32
|
+
bottom: 0; }
|
33
|
+
|
34
|
+
h1, h2, h3, h4, h5, h6 {
|
35
|
+
margin: 20px 0 10px;
|
36
|
+
padding: 0;
|
37
|
+
font-weight: bold;
|
38
|
+
-webkit-font-smoothing: antialiased;
|
39
|
+
cursor: text;
|
40
|
+
position: relative; }
|
41
|
+
|
42
|
+
h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, h5:hover a.anchor, h6:hover a.anchor {
|
43
|
+
background: url("../../images/modules/styleguide/para.png") no-repeat 10px center;
|
44
|
+
text-decoration: none; }
|
45
|
+
|
46
|
+
h1 tt, h1 code {
|
47
|
+
font-size: inherit; }
|
48
|
+
|
49
|
+
h2 tt, h2 code {
|
50
|
+
font-size: inherit; }
|
51
|
+
|
52
|
+
h3 tt, h3 code {
|
53
|
+
font-size: inherit; }
|
54
|
+
|
55
|
+
h4 tt, h4 code {
|
56
|
+
font-size: inherit; }
|
57
|
+
|
58
|
+
h5 tt, h5 code {
|
59
|
+
font-size: inherit; }
|
60
|
+
|
61
|
+
h6 tt, h6 code {
|
62
|
+
font-size: inherit; }
|
63
|
+
|
64
|
+
h1 {
|
65
|
+
font-size: 28px;
|
66
|
+
color: black; }
|
67
|
+
|
68
|
+
h2 {
|
69
|
+
font-size: 24px;
|
70
|
+
border-bottom: 1px solid #cccccc;
|
71
|
+
color: black; }
|
72
|
+
|
73
|
+
h3 {
|
74
|
+
font-size: 18px; }
|
75
|
+
|
76
|
+
h4 {
|
77
|
+
font-size: 16px; }
|
78
|
+
|
79
|
+
h5 {
|
80
|
+
font-size: 14px; }
|
81
|
+
|
82
|
+
h6 {
|
83
|
+
color: #777777;
|
84
|
+
font-size: 14px; }
|
85
|
+
|
86
|
+
p, blockquote, ul, ol, dl, li, table, pre {
|
87
|
+
margin: 15px 0; }
|
88
|
+
|
89
|
+
hr {
|
90
|
+
background: transparent url("../../images/modules/pulls/dirty-shade.png") repeat-x 0 0;
|
91
|
+
border: 0 none;
|
92
|
+
color: #cccccc;
|
93
|
+
height: 4px;
|
94
|
+
padding: 0; }
|
95
|
+
|
96
|
+
body > h2:first-child {
|
97
|
+
margin-top: 0;
|
98
|
+
padding-top: 0; }
|
99
|
+
body > h1:first-child {
|
100
|
+
margin-top: 0;
|
101
|
+
padding-top: 0; }
|
102
|
+
body > h1:first-child + h2 {
|
103
|
+
margin-top: 0;
|
104
|
+
padding-top: 0; }
|
105
|
+
body > h3:first-child, body > h4:first-child, body > h5:first-child, body > h6:first-child {
|
106
|
+
margin-top: 0;
|
107
|
+
padding-top: 0; }
|
108
|
+
|
109
|
+
a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 {
|
110
|
+
margin-top: 0;
|
111
|
+
padding-top: 0; }
|
112
|
+
|
113
|
+
h1 p, h2 p, h3 p, h4 p, h5 p, h6 p {
|
114
|
+
margin-top: 0; }
|
115
|
+
|
116
|
+
li p.first {
|
117
|
+
display: inline-block; }
|
118
|
+
|
119
|
+
ul, ol {
|
120
|
+
padding-left: 30px; }
|
121
|
+
|
122
|
+
ul :first-child, ol :first-child {
|
123
|
+
margin-top: 0; }
|
124
|
+
|
125
|
+
ul :last-child, ol :last-child {
|
126
|
+
margin-bottom: 0; }
|
127
|
+
|
128
|
+
dl {
|
129
|
+
padding: 0; }
|
130
|
+
dl dt {
|
131
|
+
font-size: 14px;
|
132
|
+
font-weight: bold;
|
133
|
+
font-style: italic;
|
134
|
+
padding: 0;
|
135
|
+
margin: 15px 0 5px; }
|
136
|
+
dl dt:first-child {
|
137
|
+
padding: 0; }
|
138
|
+
dl dt > :first-child {
|
139
|
+
margin-top: 0; }
|
140
|
+
dl dt > :last-child {
|
141
|
+
margin-bottom: 0; }
|
142
|
+
dl dd {
|
143
|
+
margin: 0 0 15px;
|
144
|
+
padding: 0 15px; }
|
145
|
+
dl dd > :first-child {
|
146
|
+
margin-top: 0; }
|
147
|
+
dl dd > :last-child {
|
148
|
+
margin-bottom: 0; }
|
149
|
+
|
150
|
+
blockquote {
|
151
|
+
border-left: 4px solid #dddddd;
|
152
|
+
padding: 0 15px;
|
153
|
+
color: #777777; }
|
154
|
+
blockquote > :first-child {
|
155
|
+
margin-top: 0; }
|
156
|
+
blockquote > :last-child {
|
157
|
+
margin-bottom: 0; }
|
158
|
+
|
159
|
+
table {
|
160
|
+
padding: 0; }
|
161
|
+
table tr {
|
162
|
+
border-top: 1px solid #cccccc;
|
163
|
+
background-color: white;
|
164
|
+
margin: 0;
|
165
|
+
padding: 0; }
|
166
|
+
table tr:nth-child(2n) {
|
167
|
+
background-color: #f8f8f8; }
|
168
|
+
table tr th {
|
169
|
+
font-weight: bold;
|
170
|
+
border: 1px solid #cccccc;
|
171
|
+
text-align: left;
|
172
|
+
margin: 0;
|
173
|
+
padding: 6px 13px; }
|
174
|
+
table tr td {
|
175
|
+
border: 1px solid #cccccc;
|
176
|
+
text-align: left;
|
177
|
+
margin: 0;
|
178
|
+
padding: 6px 13px; }
|
179
|
+
table tr th :first-child, table tr td :first-child {
|
180
|
+
margin-top: 0; }
|
181
|
+
table tr th :last-child, table tr td :last-child {
|
182
|
+
margin-bottom: 0; }
|
183
|
+
|
184
|
+
img {
|
185
|
+
max-width: 100%; }
|
186
|
+
|
187
|
+
span.frame {
|
188
|
+
display: block;
|
189
|
+
overflow: hidden; }
|
190
|
+
span.frame > span {
|
191
|
+
border: 1px solid #dddddd;
|
192
|
+
display: block;
|
193
|
+
float: left;
|
194
|
+
overflow: hidden;
|
195
|
+
margin: 13px 0 0;
|
196
|
+
padding: 7px;
|
197
|
+
width: auto; }
|
198
|
+
span.frame span img {
|
199
|
+
display: block;
|
200
|
+
float: left; }
|
201
|
+
span.frame span span {
|
202
|
+
clear: both;
|
203
|
+
color: #333333;
|
204
|
+
display: block;
|
205
|
+
padding: 5px 0 0; }
|
206
|
+
span.align-center {
|
207
|
+
display: block;
|
208
|
+
overflow: hidden;
|
209
|
+
clear: both; }
|
210
|
+
span.align-center > span {
|
211
|
+
display: block;
|
212
|
+
overflow: hidden;
|
213
|
+
margin: 13px auto 0;
|
214
|
+
text-align: center; }
|
215
|
+
span.align-center span img {
|
216
|
+
margin: 0 auto;
|
217
|
+
text-align: center; }
|
218
|
+
span.align-right {
|
219
|
+
display: block;
|
220
|
+
overflow: hidden;
|
221
|
+
clear: both; }
|
222
|
+
span.align-right > span {
|
223
|
+
display: block;
|
224
|
+
overflow: hidden;
|
225
|
+
margin: 13px 0 0;
|
226
|
+
text-align: right; }
|
227
|
+
span.align-right span img {
|
228
|
+
margin: 0;
|
229
|
+
text-align: right; }
|
230
|
+
span.float-left {
|
231
|
+
display: block;
|
232
|
+
margin-right: 13px;
|
233
|
+
overflow: hidden;
|
234
|
+
float: left; }
|
235
|
+
span.float-left span {
|
236
|
+
margin: 13px 0 0; }
|
237
|
+
span.float-right {
|
238
|
+
display: block;
|
239
|
+
margin-left: 13px;
|
240
|
+
overflow: hidden;
|
241
|
+
float: right; }
|
242
|
+
span.float-right > span {
|
243
|
+
display: block;
|
244
|
+
overflow: hidden;
|
245
|
+
margin: 13px auto 0;
|
246
|
+
text-align: right; }
|
247
|
+
|
248
|
+
code, tt {
|
249
|
+
margin: 0 2px;
|
250
|
+
padding: 0 5px;
|
251
|
+
white-space: nowrap;
|
252
|
+
border: 1px solid #eaeaea;
|
253
|
+
background-color: #f8f8f8;
|
254
|
+
border-radius: 3px; }
|
255
|
+
|
256
|
+
pre code {
|
257
|
+
margin: 0;
|
258
|
+
padding: 0;
|
259
|
+
white-space: pre;
|
260
|
+
border: none;
|
261
|
+
background: transparent; }
|
262
|
+
|
263
|
+
.highlight pre {
|
264
|
+
background-color: #f8f8f8;
|
265
|
+
border: 1px solid #cccccc;
|
266
|
+
font-size: 13px;
|
267
|
+
line-height: 19px;
|
268
|
+
overflow: auto;
|
269
|
+
padding: 6px 10px;
|
270
|
+
border-radius: 3px; }
|
271
|
+
|
272
|
+
pre {
|
273
|
+
background-color: #f8f8f8;
|
274
|
+
border: 1px solid #cccccc;
|
275
|
+
font-size: 13px;
|
276
|
+
line-height: 19px;
|
277
|
+
overflow: auto;
|
278
|
+
padding: 6px 10px;
|
279
|
+
border-radius: 3px; }
|
280
|
+
pre code, pre tt {
|
281
|
+
background-color: transparent;
|
282
|
+
border: none; }
|
283
|
+
|