language_detection 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/README.md +5 -5
- data/Rakefile +15 -2
- data/ext/cld/Makefile +1 -1
- data/language_detection.gemspec +14 -9
- data/lib/language_detection/language.rb +18 -0
- data/lib/language_detection/version.rb +1 -1
- data/lib/language_detection.rb +8 -9
- data/test/_helper.rb +2 -3
- data/test/language_detection_test.rb +12 -13
- metadata +45 -74
- data/ext/cld/cld.so +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4458c90ef41ad87f046ea204849c94f68b7c770f1ff210a53135e70104b10b14
|
4
|
+
data.tar.gz: 931f7cbbae50ee8d4e2aa119c41a610ee222c6fc50722e7ec2637517b1b891d4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e0d50b6ee77faf6c3c7b29254f9d3c80e822652530ba3b1f5d44ca944f9d05167841929e03e3f62208d67db43a8542432f42ce0931f37dd5140fdbb154be01c5
|
7
|
+
data.tar.gz: 82dee57902451034039a8445ada9e712319d12c75272e73da90ed04ddcc1c22f491e1be557f13032169f0e4dd3b647999c3b4a8e081bdff1aa5412dcb9fbc69a
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -22,15 +22,15 @@ Or install it yourself as:
|
|
22
22
|
>> require 'language_detection'
|
23
23
|
=> true
|
24
24
|
>> language = LanguageDetection.perform("This is some example text for language detection")
|
25
|
-
=>
|
25
|
+
=> #<LanguageDetection::Language:0x007fae0404f628 @name="english", @code="en", @reliable=true, @text_bytes=51, @details=[#<LanguageDetection::Language:0x007fae0404eb10 @name="english", @code="en", @details=[], @percent=100, @score=49.43273905996759>]>
|
26
26
|
>> language.name
|
27
|
-
=> "
|
27
|
+
=> "english"
|
28
28
|
>> language.code
|
29
29
|
=> "en"
|
30
30
|
>> language.reliable
|
31
31
|
=> true
|
32
32
|
>> language.details # contains up to 3 languages sorted by score
|
33
|
-
=> [
|
33
|
+
=> [#<LanguageDetection::Language:0x007fae0404eb10 @name="english", @code="en", @details=[], @percent=100, @score=49.43273905996759>]
|
34
34
|
>> language.details.first.percent
|
35
35
|
=> 100
|
36
36
|
>> language.details.first.score
|
@@ -61,7 +61,7 @@ which provides `Article#language` method using `Article#to_s` method as paramete
|
|
61
61
|
```ruby
|
62
62
|
>> article = Article.new :title => "Web development that doesn't hurt", :content => "Tens of thousands of Rails applications are already live..."
|
63
63
|
>> article.language
|
64
|
-
=>
|
64
|
+
=> #<LanguageDetection::Language:0x007fae049dd8e8 @name="english", @code="en", @reliable=true, @text_bytes=93, @details=[#<LanguageDetection::Language:0x007fae049dd118 @name="english", @code="en", @details=[], @percent=100, @score=80.22690437601297>]>
|
65
65
|
```
|
66
66
|
|
67
67
|
or you can add `String#language` method by `require 'language_detection/string'`
|
@@ -72,7 +72,7 @@ or you can add `String#language` method by `require 'language_detection/string'`
|
|
72
72
|
>> require 'language_detection/string'
|
73
73
|
=> true
|
74
74
|
>> "Web development that doesn't hurt".language
|
75
|
-
=>
|
75
|
+
=> #<LanguageDetection::Language:0x007fae049cfec8 @name="english", @code="en", @reliable=true, @text_bytes=36, @details=[#<LanguageDetection::Language:0x007fae049cf7e8 @name="english", @code="en", @details=[], @percent=100, @score=39.70826580226905>]>
|
76
76
|
```
|
77
77
|
|
78
78
|
|
data/Rakefile
CHANGED
@@ -1,11 +1,24 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
2
3
|
|
3
4
|
task :default => :test
|
4
5
|
|
5
|
-
|
6
|
+
desc "Compile extension"
|
7
|
+
task :compile do
|
8
|
+
path = File.expand_path("ext/cld/cld.so", File.dirname(__FILE__))
|
9
|
+
|
10
|
+
if !File.exist?(path) || ENV['RECOMPILE']
|
11
|
+
puts "Compiling extension..."
|
12
|
+
`cd #{File.expand_path("ext/cld/")} && make`
|
13
|
+
else
|
14
|
+
puts "Extension already compiled. To recompile set env variable RECOMPILE=true."
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
6
18
|
Rake::TestTask.new(:test) do |test|
|
19
|
+
Rake::Task["compile"].invoke
|
20
|
+
|
7
21
|
test.libs << 'lib' << 'test'
|
8
22
|
test.test_files = FileList['test/*_test.rb']
|
9
23
|
test.verbose = true
|
10
|
-
# test.warning = true
|
11
24
|
end
|
data/ext/cld/Makefile
CHANGED
data/language_detection.gemspec
CHANGED
@@ -7,10 +7,14 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "language_detection"
|
8
8
|
gem.version = LanguageDetection::VERSION
|
9
9
|
gem.authors = ["Vojtech Hyza"]
|
10
|
+
gem.license = 'MIT'
|
10
11
|
gem.email = ["vhyza@vhyza.eu"]
|
11
|
-
gem.description = %q{Language
|
12
|
-
gem.summary =
|
13
|
-
|
12
|
+
gem.description = %q{Ruby bindings for Chromium Compact Language Detector}
|
13
|
+
gem.summary = <<-EOF
|
14
|
+
Ruby bindings for Chromium Compact Language Detector ([source](http://src.chromium.org/viewvc/chrome/trunk/src/third_party/cld/)).
|
15
|
+
This gem is using source codes from [chromium-compact-language-detector](http://code.google.com/p/chromium-compact-language-detector/) port.
|
16
|
+
EOF
|
17
|
+
gem.homepage = "https://github.com/vhyza/language_detection"
|
14
18
|
|
15
19
|
gem.files = `git ls-files`.split($/)
|
16
20
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
@@ -18,11 +22,12 @@ Gem::Specification.new do |gem|
|
|
18
22
|
gem.require_paths = ["lib"]
|
19
23
|
gem.extensions = ["ext/cld/extconf.rb"]
|
20
24
|
|
21
|
-
gem.
|
22
|
-
gem.add_dependency "hashr"
|
23
|
-
gem.add_dependency "rake"
|
25
|
+
gem.add_runtime_dependency "ffi", "~> 1.12"
|
24
26
|
|
25
|
-
gem.add_development_dependency "
|
26
|
-
gem.add_development_dependency "
|
27
|
-
gem.add_development_dependency "
|
27
|
+
gem.add_development_dependency "rake", "~> 13"
|
28
|
+
gem.add_development_dependency "shoulda", "~> 4"
|
29
|
+
gem.add_development_dependency "mocha", "~> 2"
|
30
|
+
gem.add_development_dependency "test-unit", "~> 3"
|
31
|
+
|
32
|
+
gem.required_ruby_version = [ ">= 2.5.0", "< 3.3.0" ]
|
28
33
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module LanguageDetection
|
2
|
+
|
3
|
+
class Language
|
4
|
+
|
5
|
+
attr_accessor :name, :code, :reliable, :text_bytes, :details, :percent, :score
|
6
|
+
|
7
|
+
def initialize(attributes = {})
|
8
|
+
attributes.each_pair do |attribute, value|
|
9
|
+
self.send("#{attribute}=", value)
|
10
|
+
end
|
11
|
+
|
12
|
+
@details ||= []
|
13
|
+
@name.downcase!
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
data/lib/language_detection.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require "language_detection/version"
|
2
|
+
require "language_detection/language"
|
2
3
|
require "ffi"
|
3
|
-
require "hashr"
|
4
4
|
|
5
5
|
module LanguageDetection
|
6
6
|
|
@@ -10,12 +10,11 @@ module LanguageDetection
|
|
10
10
|
result = language_detection(text.to_s, is_plain_text)
|
11
11
|
|
12
12
|
language = parse_result(result, result.members - [:details])
|
13
|
-
language[:details] = []
|
14
13
|
|
15
|
-
details = FFI::Pointer.new(LanguageDetection::
|
14
|
+
details = FFI::Pointer.new(LanguageDetection::DetailStruct, result[:details])
|
16
15
|
3.times do |i|
|
17
|
-
detail = parse_result(LanguageDetection::
|
18
|
-
language
|
16
|
+
detail = parse_result(LanguageDetection::DetailStruct.new(details[i]))
|
17
|
+
language.details << detail unless detail.code == 'un'
|
19
18
|
end
|
20
19
|
|
21
20
|
language
|
@@ -28,19 +27,19 @@ module LanguageDetection
|
|
28
27
|
private
|
29
28
|
|
30
29
|
def self.parse_result(result, members = result.members)
|
31
|
-
|
30
|
+
Language.new(Hash[ members.map {|member| [member.to_sym, result[member]]} ])
|
32
31
|
end
|
33
32
|
|
34
33
|
extend FFI::Library
|
35
34
|
|
36
|
-
class
|
35
|
+
class DetailStruct < FFI::Struct
|
37
36
|
layout :name, :string,
|
38
37
|
:code, :string,
|
39
38
|
:percent, :int,
|
40
39
|
:score, :double
|
41
40
|
end
|
42
41
|
|
43
|
-
class
|
42
|
+
class LanguageStruct < FFI::Struct
|
44
43
|
layout :name, :string,
|
45
44
|
:code, :string,
|
46
45
|
:reliable, :bool,
|
@@ -49,6 +48,6 @@ module LanguageDetection
|
|
49
48
|
end
|
50
49
|
|
51
50
|
ffi_lib File.expand_path("../../ext/cld/cld.so", __FILE__)
|
52
|
-
attach_function "language_detection","language_detection", [:buffer_in, :bool],
|
51
|
+
attach_function "language_detection","language_detection", [:buffer_in, :bool], LanguageStruct.by_value
|
53
52
|
|
54
53
|
end
|
data/test/_helper.rb
CHANGED
@@ -2,8 +2,7 @@ require 'bundler/setup'
|
|
2
2
|
|
3
3
|
require 'test/unit'
|
4
4
|
require 'shoulda'
|
5
|
-
require '
|
6
|
-
require 'mocha'
|
5
|
+
require 'mocha/test_unit'
|
7
6
|
require File.join(File.expand_path('../../lib/language_detection.rb', __FILE__))
|
8
7
|
|
9
8
|
class Test::Unit::TestCase
|
@@ -12,4 +11,4 @@ class Test::Unit::TestCase
|
|
12
11
|
File.read File.expand_path("../fixtures/#{name}", __FILE__)
|
13
12
|
end
|
14
13
|
|
15
|
-
end
|
14
|
+
end
|
@@ -1,30 +1,29 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require '_helper'
|
3
|
+
require './test/_helper'
|
4
4
|
require 'csv'
|
5
5
|
|
6
6
|
class LanguageDetectionTest < Test::Unit::TestCase
|
7
7
|
|
8
8
|
context "Language detection" do
|
9
9
|
|
10
|
-
should "be able to convert result from native call to
|
10
|
+
should "be able to convert result from native call to Language instance" do
|
11
11
|
result = LanguageDetection.language_detection("this is some text", false)
|
12
12
|
parsed_result = LanguageDetection.parse_result(result)
|
13
13
|
|
14
|
-
assert_kind_of LanguageDetection::
|
15
|
-
assert_kind_of
|
14
|
+
assert_kind_of LanguageDetection::LanguageStruct, result
|
15
|
+
assert_kind_of LanguageDetection::Language, parsed_result
|
16
16
|
|
17
|
-
assert_equal "
|
18
|
-
assert_nil parsed_result.non_existing_property
|
17
|
+
assert_equal "english", parsed_result.name
|
19
18
|
end
|
20
19
|
|
21
|
-
should "convert details from FFI pointer to
|
20
|
+
should "convert details from FFI pointer to Language instance" do
|
22
21
|
language = LanguageDetection.perform("this is some text")
|
23
22
|
|
24
|
-
assert_kind_of Array,
|
25
|
-
assert_kind_of
|
26
|
-
assert_equal "
|
27
|
-
assert_equal 65,
|
23
|
+
assert_kind_of Array, language.details
|
24
|
+
assert_kind_of LanguageDetection::Language, language.details.first
|
25
|
+
assert_equal "english", language.details.first.name
|
26
|
+
assert_equal 65, language.details.first.percent
|
28
27
|
end
|
29
28
|
|
30
29
|
should "recognize languages in testing data" do
|
@@ -67,7 +66,7 @@ class LanguageDetectionTest < Test::Unit::TestCase
|
|
67
66
|
|
68
67
|
should "return detected language" do
|
69
68
|
language = @article.language
|
70
|
-
assert_equal "
|
69
|
+
assert_equal "english", language.name
|
71
70
|
assert_equal true, language.reliable
|
72
71
|
assert_equal 100, language.details.first.percent
|
73
72
|
end
|
@@ -85,4 +84,4 @@ class LanguageDetectionTest < Test::Unit::TestCase
|
|
85
84
|
end
|
86
85
|
|
87
86
|
|
88
|
-
end
|
87
|
+
end
|
metadata
CHANGED
@@ -1,113 +1,86 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: language_detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Vojtech Hyza
|
9
|
-
autorequire:
|
8
|
+
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2022-12-22 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: ffi
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
19
|
+
version: '1.12'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
30
|
-
- !ruby/object:Gem::Dependency
|
31
|
-
name: hashr
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
|
-
requirements:
|
35
|
-
- - ! '>='
|
36
|
-
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
38
|
-
type: :runtime
|
39
|
-
prerelease: false
|
40
|
-
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
26
|
+
version: '1.12'
|
46
27
|
- !ruby/object:Gem::Dependency
|
47
28
|
name: rake
|
48
29
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
30
|
requirements:
|
51
|
-
- -
|
31
|
+
- - "~>"
|
52
32
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
54
|
-
type: :
|
33
|
+
version: '13'
|
34
|
+
type: :development
|
55
35
|
prerelease: false
|
56
36
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
37
|
requirements:
|
59
|
-
- -
|
38
|
+
- - "~>"
|
60
39
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
40
|
+
version: '13'
|
62
41
|
- !ruby/object:Gem::Dependency
|
63
42
|
name: shoulda
|
64
43
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
44
|
requirements:
|
67
|
-
- -
|
45
|
+
- - "~>"
|
68
46
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
47
|
+
version: '4'
|
70
48
|
type: :development
|
71
49
|
prerelease: false
|
72
50
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
51
|
requirements:
|
75
|
-
- -
|
52
|
+
- - "~>"
|
76
53
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
54
|
+
version: '4'
|
78
55
|
- !ruby/object:Gem::Dependency
|
79
56
|
name: mocha
|
80
57
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
58
|
requirements:
|
83
|
-
- -
|
59
|
+
- - "~>"
|
84
60
|
- !ruby/object:Gem::Version
|
85
|
-
version: '
|
61
|
+
version: '2'
|
86
62
|
type: :development
|
87
63
|
prerelease: false
|
88
64
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
65
|
requirements:
|
91
|
-
- -
|
66
|
+
- - "~>"
|
92
67
|
- !ruby/object:Gem::Version
|
93
|
-
version: '
|
68
|
+
version: '2'
|
94
69
|
- !ruby/object:Gem::Dependency
|
95
|
-
name:
|
70
|
+
name: test-unit
|
96
71
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
72
|
requirements:
|
99
|
-
- -
|
73
|
+
- - "~>"
|
100
74
|
- !ruby/object:Gem::Version
|
101
|
-
version: '
|
75
|
+
version: '3'
|
102
76
|
type: :development
|
103
77
|
prerelease: false
|
104
78
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
79
|
requirements:
|
107
|
-
- -
|
80
|
+
- - "~>"
|
108
81
|
- !ruby/object:Gem::Version
|
109
|
-
version: '
|
110
|
-
description: Language
|
82
|
+
version: '3'
|
83
|
+
description: Ruby bindings for Chromium Compact Language Detector
|
111
84
|
email:
|
112
85
|
- vhyza@vhyza.eu
|
113
86
|
executables: []
|
@@ -115,7 +88,7 @@ extensions:
|
|
115
88
|
- ext/cld/extconf.rb
|
116
89
|
extra_rdoc_files: []
|
117
90
|
files:
|
118
|
-
- .gitignore
|
91
|
+
- ".gitignore"
|
119
92
|
- Gemfile
|
120
93
|
- LICENSE.txt
|
121
94
|
- README.md
|
@@ -140,7 +113,6 @@ files:
|
|
140
113
|
- ext/cld/base/template_util.h
|
141
114
|
- ext/cld/base/type_traits.h
|
142
115
|
- ext/cld/base/vlog_is_on.h
|
143
|
-
- ext/cld/cld.so
|
144
116
|
- ext/cld/encodings/compact_lang_det/cldutil.cc
|
145
117
|
- ext/cld/encodings/compact_lang_det/cldutil.h
|
146
118
|
- ext/cld/encodings/compact_lang_det/cldutil_dbg.h
|
@@ -209,41 +181,40 @@ files:
|
|
209
181
|
- ext/cld/languages/public/languages.h
|
210
182
|
- language_detection.gemspec
|
211
183
|
- lib/language_detection.rb
|
184
|
+
- lib/language_detection/language.rb
|
212
185
|
- lib/language_detection/string.rb
|
213
186
|
- lib/language_detection/version.rb
|
214
187
|
- test/_helper.rb
|
215
188
|
- test/fixtures/languages.csv
|
216
189
|
- test/language_detection_test.rb
|
217
|
-
homepage:
|
218
|
-
licenses:
|
219
|
-
|
190
|
+
homepage: https://github.com/vhyza/language_detection
|
191
|
+
licenses:
|
192
|
+
- MIT
|
193
|
+
metadata: {}
|
194
|
+
post_install_message:
|
220
195
|
rdoc_options: []
|
221
196
|
require_paths:
|
222
197
|
- lib
|
223
198
|
required_ruby_version: !ruby/object:Gem::Requirement
|
224
|
-
none: false
|
225
199
|
requirements:
|
226
|
-
- -
|
200
|
+
- - ">="
|
227
201
|
- !ruby/object:Gem::Version
|
228
|
-
version:
|
229
|
-
|
230
|
-
|
231
|
-
|
202
|
+
version: 2.5.0
|
203
|
+
- - "<"
|
204
|
+
- !ruby/object:Gem::Version
|
205
|
+
version: 3.3.0
|
232
206
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
233
|
-
none: false
|
234
207
|
requirements:
|
235
|
-
- -
|
208
|
+
- - ">="
|
236
209
|
- !ruby/object:Gem::Version
|
237
210
|
version: '0'
|
238
|
-
segments:
|
239
|
-
- 0
|
240
|
-
hash: 301210449373780646
|
241
211
|
requirements: []
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
212
|
+
rubygems_version: 3.3.7
|
213
|
+
signing_key:
|
214
|
+
specification_version: 4
|
215
|
+
summary: Ruby bindings for Chromium Compact Language Detector ([source](http://src.chromium.org/viewvc/chrome/trunk/src/third_party/cld/)).
|
216
|
+
This gem is using source codes from [chromium-compact-language-detector](http://code.google.com/p/chromium-compact-language-detector/)
|
217
|
+
port.
|
247
218
|
test_files:
|
248
219
|
- test/_helper.rb
|
249
220
|
- test/fixtures/languages.csv
|
data/ext/cld/cld.so
DELETED
Binary file
|