cld 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/README.md +34 -0
- data/Rakefile +4 -14
- data/cld.gemspec +16 -27
- data/ext/cld/Makefile +31 -0
- data/{base → ext/cld/base}/basictypes.h +0 -0
- data/{base → ext/cld/base}/build_config.h +0 -0
- data/{base → ext/cld/base}/casts.h +0 -0
- data/{base → ext/cld/base}/commandlineflags.h +0 -0
- data/{base → ext/cld/base}/crash.h +0 -0
- data/{base → ext/cld/base}/dynamic_annotations.h +0 -0
- data/{base → ext/cld/base}/global_strip_options.h +0 -0
- data/{base → ext/cld/base}/log_severity.h +0 -0
- data/{base → ext/cld/base}/logging.h +0 -0
- data/{base → ext/cld/base}/macros.h +0 -0
- data/{base → ext/cld/base}/port.h +0 -0
- data/{base → ext/cld/base}/scoped_ptr.h +0 -0
- data/{base → ext/cld/base}/stl_decl.h +0 -0
- data/{base → ext/cld/base}/stl_decl_msvc.h +0 -0
- data/{base → ext/cld/base}/string_util.h +0 -0
- data/{base → ext/cld/base}/strtoint.h +0 -0
- data/{base → ext/cld/base}/template_util.h +0 -0
- data/{base → ext/cld/base}/type_traits.h +0 -0
- data/{base → ext/cld/base}/vlog_is_on.h +0 -0
- data/{cld_encodings.h → ext/cld/cld_encodings.h} +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#cldutil.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#cldutil.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#compact_lang_det_impl.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#ext_lang_enc.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#ext_lang_enc.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#getonescriptspan.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#getonescriptspan.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#tote.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#tote.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil_dbg.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil_dbg_empty.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_impl.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_impl.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_unittest_small.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compile.cmd +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/ext_lang_enc.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/ext_lang_enc.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/getonescriptspan.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/getonescriptspan.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/letterscript_enum.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/letterscript_enum.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence_unittest.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/tote.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/tote.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/unittest_data.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8propjustletter.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8propletterscriptnum.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8scannotjustletterspecial.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/#cld_unilib_windows.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_basictypes.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_commandlineflags.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_google.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_logging.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_macros.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_strtoint.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unicodetext.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unicodetext.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8statetable.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8statetable.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/normalizedunicodetext.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/normalizedunicodetext.h +0 -0
- data/{encodings → ext/cld/encodings}/internal/encodings.cc +0 -0
- data/{encodings → ext/cld/encodings}/lang_enc.h +0 -0
- data/{encodings → ext/cld/encodings}/proto/encodings.pb.h +0 -0
- data/{encodings → ext/cld/encodings}/public/encodings.h +0 -0
- data/ext/cld/extconf.rb +2 -7
- data/{languages → ext/cld/languages}/internal/#languages.cc# +0 -0
- data/{languages → ext/cld/languages}/internal/languages.cc +0 -0
- data/{languages → ext/cld/languages}/proto/languages.pb.h +0 -0
- data/{languages → ext/cld/languages}/public/languages.h +0 -0
- data/ext/cld/thunk.cc +56 -0
- data/lib/cld.rb +15 -6
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +44 -0
- data/spec/spec_helper.rb +6 -0
- metadata +132 -125
- data/Manifest +0 -105
- data/README.rdoc +0 -173
- data/build.sh +0 -48
- data/build.win.cmd +0 -28
- data/test/test.rb +0 -570
- data/thunk.cc +0 -131
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Compact Language Detection
|
2
|
+
|
3
|
+
Blazing-fast language detection for Ruby provided by
|
4
|
+
Google Chrome's Compact Language Detector.
|
5
|
+
|
6
|
+
## How to Use
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
CLD.detect_language("This is a test")
|
10
|
+
# => {:name => "ENGLISH", :code => "en", :reliable => true}
|
11
|
+
|
12
|
+
CLD.detect_language("plus ça change, plus c'est la même chose")
|
13
|
+
# => {:name => "FRENCH", :code => "fr", :reliable => true}
|
14
|
+
```
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Add this line to your application's Gemfile:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem "cld"
|
22
|
+
```
|
23
|
+
|
24
|
+
And then execute:
|
25
|
+
|
26
|
+
```sh
|
27
|
+
$ bundle
|
28
|
+
```
|
29
|
+
|
30
|
+
## Thanks
|
31
|
+
|
32
|
+
Thanks to the Chrome authors, and to Mike McCandless for writing a Python version.
|
33
|
+
|
34
|
+
Licensed under the same terms as Chrome. Jason Toy
|
data/Rakefile
CHANGED
@@ -1,15 +1,5 @@
|
|
1
|
-
|
2
|
-
require
|
3
|
-
require 'echoe'
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
p.url = "http://github.com/jtoy/cld"
|
8
|
-
p.author = "Jason Toy"
|
9
|
-
p.email = "jtoy@jtoy.net"
|
10
|
-
p.ignore_pattern = ["tmp/*", "script/*"]
|
11
|
-
p.runtime_dependencies = ["ffi"]
|
12
|
-
p.development_dependencies = []
|
13
|
-
end
|
14
|
-
|
15
|
-
#Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
|
4
|
+
require "rspec/core/rake_task"
|
5
|
+
RSpec::Core::RakeTask.new("spec")
|
data/cld.gemspec
CHANGED
@@ -1,33 +1,22 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/cld/version', __FILE__)
|
2
3
|
|
3
|
-
Gem::Specification.new do |s|
|
4
|
-
|
5
|
-
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Jason Toy"]
|
6
|
+
gem.email = ["jtoy@jtoy.net"]
|
7
|
+
gem.description = %q{Compact Language Detection for Ruby}
|
8
|
+
gem.summary = %q{Compact Language Detection for Ruby}
|
9
|
+
gem.homepage = "http://github.com/jtoy/cld"
|
6
10
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
s.files = [%q{LICENSE}, %q{README.rdoc}, %q{Rakefile}, %q{base/basictypes.h}, %q{base/build_config.h}, %q{base/casts.h}, %q{base/commandlineflags.h}, %q{base/crash.h}, %q{base/dynamic_annotations.h}, %q{base/global_strip_options.h}, %q{base/log_severity.h}, %q{base/logging.h}, %q{base/macros.h}, %q{base/port.h}, %q{base/scoped_ptr.h}, %q{base/stl_decl.h}, %q{base/stl_decl_msvc.h}, %q{base/string_util.h}, %q{base/strtoint.h}, %q{base/template_util.h}, %q{base/type_traits.h}, %q{base/vlog_is_on.h}, %q{build.sh}, %q{build.win.cmd}, %q{cld_encodings.h}, %q{encodings/compact_lang_det/#cldutil.cc#}, %q{encodings/compact_lang_det/#cldutil.h#}, %q{encodings/compact_lang_det/#compact_lang_det_impl.h#}, %q{encodings/compact_lang_det/#ext_lang_enc.cc#}, %q{encodings/compact_lang_det/#ext_lang_enc.h#}, %q{encodings/compact_lang_det/#getonescriptspan.cc#}, %q{encodings/compact_lang_det/#getonescriptspan.h#}, %q{encodings/compact_lang_det/#tote.cc#}, %q{encodings/compact_lang_det/#tote.h#}, %q{encodings/compact_lang_det/cldutil.cc}, %q{encodings/compact_lang_det/cldutil.h}, %q{encodings/compact_lang_det/cldutil_dbg.h}, %q{encodings/compact_lang_det/cldutil_dbg_empty.cc}, %q{encodings/compact_lang_det/compact_lang_det.cc}, %q{encodings/compact_lang_det/compact_lang_det.h}, %q{encodings/compact_lang_det/compact_lang_det_impl.cc}, %q{encodings/compact_lang_det/compact_lang_det_impl.h}, %q{encodings/compact_lang_det/compact_lang_det_unittest_small.cc}, %q{encodings/compact_lang_det/compile.cmd}, %q{encodings/compact_lang_det/ext_lang_enc.cc}, %q{encodings/compact_lang_det/ext_lang_enc.h}, %q{encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc}, %q{encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc}, 
%q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc}, %q{encodings/compact_lang_det/getonescriptspan.cc}, %q{encodings/compact_lang_det/getonescriptspan.h}, %q{encodings/compact_lang_det/letterscript_enum.cc}, %q{encodings/compact_lang_det/letterscript_enum.h}, %q{encodings/compact_lang_det/subsetsequence.cc}, %q{encodings/compact_lang_det/subsetsequence.h}, %q{encodings/compact_lang_det/subsetsequence_unittest.cc}, %q{encodings/compact_lang_det/tote.cc}, %q{encodings/compact_lang_det/tote.h}, %q{encodings/compact_lang_det/unittest_data.h}, %q{encodings/compact_lang_det/utf8propjustletter.h}, %q{encodings/compact_lang_det/utf8propletterscriptnum.h}, %q{encodings/compact_lang_det/utf8scannotjustletterspecial.h}, %q{encodings/compact_lang_det/win/#cld_unilib_windows.cc#}, %q{encodings/compact_lang_det/win/cld_basictypes.h}, %q{encodings/compact_lang_det/win/cld_commandlineflags.h}, %q{encodings/compact_lang_det/win/cld_google.h}, %q{encodings/compact_lang_det/win/cld_htmlutils.h}, %q{encodings/compact_lang_det/win/cld_htmlutils_google3.cc}, %q{encodings/compact_lang_det/win/cld_htmlutils_windows.cc}, %q{encodings/compact_lang_det/win/cld_logging.h}, %q{encodings/compact_lang_det/win/cld_macros.h}, %q{encodings/compact_lang_det/win/cld_strtoint.h}, %q{encodings/compact_lang_det/win/cld_unicodetext.cc}, %q{encodings/compact_lang_det/win/cld_unicodetext.h}, %q{encodings/compact_lang_det/win/cld_unilib.h}, %q{encodings/compact_lang_det/win/cld_unilib_google3.cc}, %q{encodings/compact_lang_det/win/cld_unilib_windows.cc}, 
%q{encodings/compact_lang_det/win/cld_utf.h}, %q{encodings/compact_lang_det/win/cld_utf8statetable.cc}, %q{encodings/compact_lang_det/win/cld_utf8statetable.h}, %q{encodings/compact_lang_det/win/cld_utf8utils.h}, %q{encodings/compact_lang_det/win/cld_utf8utils_google3.cc}, %q{encodings/compact_lang_det/win/cld_utf8utils_windows.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.h}, %q{encodings/internal/encodings.cc}, %q{encodings/lang_enc.h}, %q{encodings/proto/encodings.pb.h}, %q{encodings/public/encodings.h}, %q{ext/cld/extconf.rb}, %q{languages/internal/#languages.cc#}, %q{languages/internal/languages.cc}, %q{languages/proto/languages.pb.h}, %q{languages/public/languages.h}, %q{lib/cld.rb}, %q{test/test.rb}, %q{thunk.cc}, %q{Manifest}, %q{cld.gemspec}]
|
15
|
-
s.homepage = %q{http://github.com/jtoy/cld}
|
16
|
-
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Cld}, %q{--main}, %q{README.rdoc}]
|
17
|
-
s.require_paths = [%q{lib}, %q{ext}]
|
18
|
-
s.rubyforge_project = %q{cld}
|
19
|
-
s.rubygems_version = %q{1.8.6.1}
|
20
|
-
s.summary = %q{Compact Language Detection from chrome}
|
11
|
+
gem.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
12
|
+
gem.files = `git ls-files`.split("\n")
|
13
|
+
gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
14
|
+
gem.extensions = ["ext/cld/extconf.rb"]
|
15
|
+
gem.name = "cld"
|
16
|
+
gem.require_paths = ["lib"]
|
17
|
+
gem.version = CLD::VERSION
|
21
18
|
|
22
|
-
|
23
|
-
s.specification_version = 3
|
19
|
+
gem.add_dependency "ffi"
|
24
20
|
|
25
|
-
|
26
|
-
s.add_runtime_dependency(%q<ffi>, [">= 0"])
|
27
|
-
else
|
28
|
-
s.add_dependency(%q<ffi>, [">= 0"])
|
29
|
-
end
|
30
|
-
else
|
31
|
-
s.add_dependency(%q<ffi>, [">= 0"])
|
32
|
-
end
|
21
|
+
gem.add_development_dependency "rspec"
|
33
22
|
end
|
data/ext/cld/Makefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
CFLAGS=-fPIC -I. -O2 -DCLD_WINDOWS
|
2
|
+
LDFLAGS=-L.
|
3
|
+
CC=g++
|
4
|
+
AR=ar
|
5
|
+
SOURCES=encodings/compact_lang_det/cldutil.cc \
|
6
|
+
encodings/compact_lang_det/cldutil_dbg_empty.cc \
|
7
|
+
encodings/compact_lang_det/compact_lang_det.cc \
|
8
|
+
encodings/compact_lang_det/compact_lang_det_impl.cc \
|
9
|
+
encodings/compact_lang_det/ext_lang_enc.cc \
|
10
|
+
encodings/compact_lang_det/getonescriptspan.cc \
|
11
|
+
encodings/compact_lang_det/letterscript_enum.cc \
|
12
|
+
encodings/compact_lang_det/tote.cc \
|
13
|
+
encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
|
14
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
|
15
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
|
16
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
|
17
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
|
18
|
+
encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
|
19
|
+
encodings/compact_lang_det/win/cld_unilib_windows.cc \
|
20
|
+
encodings/compact_lang_det/win/cld_utf8statetable.cc \
|
21
|
+
encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
|
22
|
+
encodings/internal/encodings.cc \
|
23
|
+
languages/internal/languages.cc \
|
24
|
+
thunk.cc
|
25
|
+
|
26
|
+
install:
|
27
|
+
rm -f *.o
|
28
|
+
rm -f libcld.a
|
29
|
+
$(CC) -c $(CFLAGS) $(SOURCES)
|
30
|
+
$(AR) rcs libcld.a *.o
|
31
|
+
$(CC) -DCLD_WINDOWS -I. -L. -shared -o cld.so -lstdc++ *.o
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/ext/cld/extconf.rb
CHANGED
@@ -1,7 +1,2 @@
|
|
1
|
-
require
|
2
|
-
|
3
|
-
puts home_dir
|
4
|
-
cmd = "cd #{home_dir}; ./build.sh"
|
5
|
-
sh cmd
|
6
|
-
sh "mv #{home_dir}/cld.so #{home_dir}/ext/cld/"
|
7
|
-
sh "echo 'install:\n\tdate' > #{home_dir}/ext/cld/Makefile"
|
1
|
+
#require 'mkmf'
|
2
|
+
#create_makefile('cld/cld')
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/ext/cld/thunk.cc
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
4
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
5
|
+
#include "encodings/compact_lang_det/unittest_data.h"
|
6
|
+
#include "encodings/proto/encodings.pb.h"
|
7
|
+
|
8
|
+
typedef struct {
|
9
|
+
const char *name;
|
10
|
+
const char *code;
|
11
|
+
bool reliable;
|
12
|
+
} RESULT;
|
13
|
+
|
14
|
+
extern "C" {
|
15
|
+
RESULT detectLanguageThunkInt(const char * src) {
|
16
|
+
bool is_plain_text = true;
|
17
|
+
bool do_allow_extended_languages = true;
|
18
|
+
bool do_pick_summary_language = false;
|
19
|
+
bool do_remove_weak_matches = false;
|
20
|
+
bool is_reliable;
|
21
|
+
Language plus_one = UNKNOWN_LANGUAGE;
|
22
|
+
const char* tld_hint = NULL;
|
23
|
+
int encoding_hint = UNKNOWN_ENCODING;
|
24
|
+
Language language_hint = UNKNOWN_LANGUAGE;
|
25
|
+
|
26
|
+
double normalized_score3[3];
|
27
|
+
Language language3[3];
|
28
|
+
int percent3[3];
|
29
|
+
int text_bytes;
|
30
|
+
|
31
|
+
Language lang;
|
32
|
+
lang = CompactLangDet::DetectLanguage(0,
|
33
|
+
src, strlen(src),
|
34
|
+
is_plain_text,
|
35
|
+
do_allow_extended_languages,
|
36
|
+
do_pick_summary_language,
|
37
|
+
do_remove_weak_matches,
|
38
|
+
tld_hint,
|
39
|
+
encoding_hint,
|
40
|
+
language_hint,
|
41
|
+
language3,
|
42
|
+
percent3,
|
43
|
+
normalized_score3,
|
44
|
+
&text_bytes,
|
45
|
+
&is_reliable);
|
46
|
+
|
47
|
+
RESULT res;
|
48
|
+
res.name = LanguageName(lang);
|
49
|
+
res.code = ExtLanguageCode(lang);
|
50
|
+
res.reliable = is_reliable;
|
51
|
+
return res;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
int main(int argc, char **argv) {
|
56
|
+
}
|
data/lib/cld.rb
CHANGED
@@ -1,12 +1,21 @@
|
|
1
|
-
require "
|
1
|
+
require "cld/version"
|
2
2
|
require "ffi"
|
3
3
|
|
4
4
|
module CLD
|
5
5
|
extend FFI::Library
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
detect_language(text) == 0
|
6
|
+
|
7
|
+
def self.detect_language(text)
|
8
|
+
result = detect_language_ext(text)
|
9
|
+
Hash[ result.members.map {|member| [member.to_sym, result[member]]} ]
|
11
10
|
end
|
11
|
+
|
12
|
+
private
|
13
|
+
|
14
|
+
class ReturnValue < FFI::Struct
|
15
|
+
layout :name, :string, :code, :string, :reliable, :bool
|
16
|
+
end
|
17
|
+
|
18
|
+
GEM_ROOT = File.expand_path("../../", __FILE__)
|
19
|
+
ffi_lib "#{GEM_ROOT}/ext/cld/cld.so"
|
20
|
+
attach_function "detect_language_ext","detectLanguageThunkInt", [:buffer_in], ReturnValue.by_value
|
12
21
|
end
|