cld 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/README.md +34 -0
- data/Rakefile +4 -14
- data/cld.gemspec +16 -27
- data/ext/cld/Makefile +31 -0
- data/{base → ext/cld/base}/basictypes.h +0 -0
- data/{base → ext/cld/base}/build_config.h +0 -0
- data/{base → ext/cld/base}/casts.h +0 -0
- data/{base → ext/cld/base}/commandlineflags.h +0 -0
- data/{base → ext/cld/base}/crash.h +0 -0
- data/{base → ext/cld/base}/dynamic_annotations.h +0 -0
- data/{base → ext/cld/base}/global_strip_options.h +0 -0
- data/{base → ext/cld/base}/log_severity.h +0 -0
- data/{base → ext/cld/base}/logging.h +0 -0
- data/{base → ext/cld/base}/macros.h +0 -0
- data/{base → ext/cld/base}/port.h +0 -0
- data/{base → ext/cld/base}/scoped_ptr.h +0 -0
- data/{base → ext/cld/base}/stl_decl.h +0 -0
- data/{base → ext/cld/base}/stl_decl_msvc.h +0 -0
- data/{base → ext/cld/base}/string_util.h +0 -0
- data/{base → ext/cld/base}/strtoint.h +0 -0
- data/{base → ext/cld/base}/template_util.h +0 -0
- data/{base → ext/cld/base}/type_traits.h +0 -0
- data/{base → ext/cld/base}/vlog_is_on.h +0 -0
- data/{cld_encodings.h → ext/cld/cld_encodings.h} +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#cldutil.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#cldutil.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#compact_lang_det_impl.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#ext_lang_enc.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#ext_lang_enc.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#getonescriptspan.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#getonescriptspan.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#tote.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/#tote.h# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil_dbg.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/cldutil_dbg_empty.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_impl.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_impl.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compact_lang_det_unittest_small.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/compile.cmd +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/ext_lang_enc.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/ext_lang_enc.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/getonescriptspan.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/getonescriptspan.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/letterscript_enum.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/letterscript_enum.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/subsetsequence_unittest.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/tote.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/tote.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/unittest_data.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8propjustletter.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8propletterscriptnum.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/utf8scannotjustletterspecial.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/#cld_unilib_windows.cc# +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_basictypes.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_commandlineflags.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_google.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_htmlutils_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_logging.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_macros.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_strtoint.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unicodetext.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unicodetext.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_unilib_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8statetable.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8statetable.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils.h +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils_google3.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/cld_utf8utils_windows.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/normalizedunicodetext.cc +0 -0
- data/{encodings → ext/cld/encodings}/compact_lang_det/win/normalizedunicodetext.h +0 -0
- data/{encodings → ext/cld/encodings}/internal/encodings.cc +0 -0
- data/{encodings → ext/cld/encodings}/lang_enc.h +0 -0
- data/{encodings → ext/cld/encodings}/proto/encodings.pb.h +0 -0
- data/{encodings → ext/cld/encodings}/public/encodings.h +0 -0
- data/ext/cld/extconf.rb +2 -7
- data/{languages → ext/cld/languages}/internal/#languages.cc# +0 -0
- data/{languages → ext/cld/languages}/internal/languages.cc +0 -0
- data/{languages → ext/cld/languages}/proto/languages.pb.h +0 -0
- data/{languages → ext/cld/languages}/public/languages.h +0 -0
- data/ext/cld/thunk.cc +56 -0
- data/lib/cld.rb +15 -6
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +44 -0
- data/spec/spec_helper.rb +6 -0
- metadata +132 -125
- data/Manifest +0 -105
- data/README.rdoc +0 -173
- data/build.sh +0 -48
- data/build.win.cmd +0 -28
- data/test/test.rb +0 -570
- data/thunk.cc +0 -131
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Compact Language Detection
|
|
2
|
+
|
|
3
|
+
Blazing-fast language detection for Ruby provided by
|
|
4
|
+
Google Chrome's Compact Language Detector.
|
|
5
|
+
|
|
6
|
+
## How to Use
|
|
7
|
+
|
|
8
|
+
```ruby
|
|
9
|
+
CLD.detect_language("This is a test")
|
|
10
|
+
# => {:name => "ENGLISH", :code => "en", :reliable => true}
|
|
11
|
+
|
|
12
|
+
CLD.detect_language("plus ça change, plus c'est la même chose")
|
|
13
|
+
# => {:name => "FRENCH", :code => "fr", :reliable => true}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Add this line to your application's Gemfile:
|
|
19
|
+
|
|
20
|
+
```ruby
|
|
21
|
+
gem "cld"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
And then execute:
|
|
25
|
+
|
|
26
|
+
```sh
|
|
27
|
+
$ bundle
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Thanks
|
|
31
|
+
|
|
32
|
+
Thanks to the Chrome authors, and to Mike McCandless for writing a Python version.
|
|
33
|
+
|
|
34
|
+
Licensed the same as Chrome. Jason Toy
|
data/Rakefile
CHANGED
|
@@ -1,15 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
require
|
|
3
|
-
require 'echoe'
|
|
1
|
+
#!/usr/bin/env rake
|
|
2
|
+
require "bundler/gem_tasks"
|
|
4
3
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
p.url = "http://github.com/jtoy/cld"
|
|
8
|
-
p.author = "Jason Toy"
|
|
9
|
-
p.email = "jtoy@jtoy.net"
|
|
10
|
-
p.ignore_pattern = ["tmp/*", "script/*"]
|
|
11
|
-
p.runtime_dependencies = ["ffi"]
|
|
12
|
-
p.development_dependencies = []
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
#Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
|
|
4
|
+
require "rspec/core/rake_task"
|
|
5
|
+
RSpec::Core::RakeTask.new("spec")
|
data/cld.gemspec
CHANGED
|
@@ -1,33 +1,22 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
+
require File.expand_path('../lib/cld/version', __FILE__)
|
|
2
3
|
|
|
3
|
-
Gem::Specification.new do |
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
Gem::Specification.new do |gem|
|
|
5
|
+
gem.authors = ["Jason Toy"]
|
|
6
|
+
gem.email = ["jtoy@jtoy.net"]
|
|
7
|
+
gem.description = %q{Compact Language Detection for Ruby}
|
|
8
|
+
gem.summary = %q{Compact Language Detection for Ruby}
|
|
9
|
+
gem.homepage = "http://github.com/jtoy/cld"
|
|
6
10
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
s.files = [%q{LICENSE}, %q{README.rdoc}, %q{Rakefile}, %q{base/basictypes.h}, %q{base/build_config.h}, %q{base/casts.h}, %q{base/commandlineflags.h}, %q{base/crash.h}, %q{base/dynamic_annotations.h}, %q{base/global_strip_options.h}, %q{base/log_severity.h}, %q{base/logging.h}, %q{base/macros.h}, %q{base/port.h}, %q{base/scoped_ptr.h}, %q{base/stl_decl.h}, %q{base/stl_decl_msvc.h}, %q{base/string_util.h}, %q{base/strtoint.h}, %q{base/template_util.h}, %q{base/type_traits.h}, %q{base/vlog_is_on.h}, %q{build.sh}, %q{build.win.cmd}, %q{cld_encodings.h}, %q{encodings/compact_lang_det/#cldutil.cc#}, %q{encodings/compact_lang_det/#cldutil.h#}, %q{encodings/compact_lang_det/#compact_lang_det_impl.h#}, %q{encodings/compact_lang_det/#ext_lang_enc.cc#}, %q{encodings/compact_lang_det/#ext_lang_enc.h#}, %q{encodings/compact_lang_det/#getonescriptspan.cc#}, %q{encodings/compact_lang_det/#getonescriptspan.h#}, %q{encodings/compact_lang_det/#tote.cc#}, %q{encodings/compact_lang_det/#tote.h#}, %q{encodings/compact_lang_det/cldutil.cc}, %q{encodings/compact_lang_det/cldutil.h}, %q{encodings/compact_lang_det/cldutil_dbg.h}, %q{encodings/compact_lang_det/cldutil_dbg_empty.cc}, %q{encodings/compact_lang_det/compact_lang_det.cc}, %q{encodings/compact_lang_det/compact_lang_det.h}, %q{encodings/compact_lang_det/compact_lang_det_impl.cc}, %q{encodings/compact_lang_det/compact_lang_det_impl.h}, %q{encodings/compact_lang_det/compact_lang_det_unittest_small.cc}, %q{encodings/compact_lang_det/compile.cmd}, %q{encodings/compact_lang_det/ext_lang_enc.cc}, %q{encodings/compact_lang_det/ext_lang_enc.h}, %q{encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc}, %q{encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc}, %q{encodings/compact_lang_det/getonescriptspan.cc}, %q{encodings/compact_lang_det/getonescriptspan.h}, %q{encodings/compact_lang_det/letterscript_enum.cc}, %q{encodings/compact_lang_det/letterscript_enum.h}, %q{encodings/compact_lang_det/subsetsequence.cc}, %q{encodings/compact_lang_det/subsetsequence.h}, %q{encodings/compact_lang_det/subsetsequence_unittest.cc}, %q{encodings/compact_lang_det/tote.cc}, %q{encodings/compact_lang_det/tote.h}, %q{encodings/compact_lang_det/unittest_data.h}, %q{encodings/compact_lang_det/utf8propjustletter.h}, %q{encodings/compact_lang_det/utf8propletterscriptnum.h}, %q{encodings/compact_lang_det/utf8scannotjustletterspecial.h}, %q{encodings/compact_lang_det/win/#cld_unilib_windows.cc#}, %q{encodings/compact_lang_det/win/cld_basictypes.h}, %q{encodings/compact_lang_det/win/cld_commandlineflags.h}, %q{encodings/compact_lang_det/win/cld_google.h}, %q{encodings/compact_lang_det/win/cld_htmlutils.h}, %q{encodings/compact_lang_det/win/cld_htmlutils_google3.cc}, %q{encodings/compact_lang_det/win/cld_htmlutils_windows.cc}, %q{encodings/compact_lang_det/win/cld_logging.h}, %q{encodings/compact_lang_det/win/cld_macros.h}, %q{encodings/compact_lang_det/win/cld_strtoint.h}, %q{encodings/compact_lang_det/win/cld_unicodetext.cc}, %q{encodings/compact_lang_det/win/cld_unicodetext.h}, %q{encodings/compact_lang_det/win/cld_unilib.h}, %q{encodings/compact_lang_det/win/cld_unilib_google3.cc}, %q{encodings/compact_lang_det/win/cld_unilib_windows.cc}, %q{encodings/compact_lang_det/win/cld_utf.h}, %q{encodings/compact_lang_det/win/cld_utf8statetable.cc}, %q{encodings/compact_lang_det/win/cld_utf8statetable.h}, %q{encodings/compact_lang_det/win/cld_utf8utils.h}, %q{encodings/compact_lang_det/win/cld_utf8utils_google3.cc}, %q{encodings/compact_lang_det/win/cld_utf8utils_windows.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.h}, %q{encodings/internal/encodings.cc}, %q{encodings/lang_enc.h}, %q{encodings/proto/encodings.pb.h}, %q{encodings/public/encodings.h}, %q{ext/cld/extconf.rb}, %q{languages/internal/#languages.cc#}, %q{languages/internal/languages.cc}, %q{languages/proto/languages.pb.h}, %q{languages/public/languages.h}, %q{lib/cld.rb}, %q{test/test.rb}, %q{thunk.cc}, %q{Manifest}, %q{cld.gemspec}]
|
|
15
|
-
s.homepage = %q{http://github.com/jtoy/cld}
|
|
16
|
-
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Cld}, %q{--main}, %q{README.rdoc}]
|
|
17
|
-
s.require_paths = [%q{lib}, %q{ext}]
|
|
18
|
-
s.rubyforge_project = %q{cld}
|
|
19
|
-
s.rubygems_version = %q{1.8.6.1}
|
|
20
|
-
s.summary = %q{Compact Language Detection from chrome}
|
|
11
|
+
gem.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
|
12
|
+
gem.files = `git ls-files`.split("\n")
|
|
13
|
+
gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
|
14
|
+
gem.extensions = ["ext/cld/extconf.rb"]
|
|
15
|
+
gem.name = "cld"
|
|
16
|
+
gem.require_paths = ["lib"]
|
|
17
|
+
gem.version = CLD::VERSION
|
|
21
18
|
|
|
22
|
-
|
|
23
|
-
s.specification_version = 3
|
|
19
|
+
gem.add_dependency "ffi"
|
|
24
20
|
|
|
25
|
-
|
|
26
|
-
s.add_runtime_dependency(%q<ffi>, [">= 0"])
|
|
27
|
-
else
|
|
28
|
-
s.add_dependency(%q<ffi>, [">= 0"])
|
|
29
|
-
end
|
|
30
|
-
else
|
|
31
|
-
s.add_dependency(%q<ffi>, [">= 0"])
|
|
32
|
-
end
|
|
21
|
+
gem.add_development_dependency "rspec"
|
|
33
22
|
end
|
data/ext/cld/Makefile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
CFLAGS=-fPIC -I. -O2 -DCLD_WINDOWS
|
|
2
|
+
LDFLAGS=-L.
|
|
3
|
+
CC=g++
|
|
4
|
+
AR=ar
|
|
5
|
+
SOURCES=encodings/compact_lang_det/cldutil.cc \
|
|
6
|
+
encodings/compact_lang_det/cldutil_dbg_empty.cc \
|
|
7
|
+
encodings/compact_lang_det/compact_lang_det.cc \
|
|
8
|
+
encodings/compact_lang_det/compact_lang_det_impl.cc \
|
|
9
|
+
encodings/compact_lang_det/ext_lang_enc.cc \
|
|
10
|
+
encodings/compact_lang_det/getonescriptspan.cc \
|
|
11
|
+
encodings/compact_lang_det/letterscript_enum.cc \
|
|
12
|
+
encodings/compact_lang_det/tote.cc \
|
|
13
|
+
encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
|
|
14
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
|
|
15
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
|
|
16
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
|
|
17
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
|
|
18
|
+
encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
|
|
19
|
+
encodings/compact_lang_det/win/cld_unilib_windows.cc \
|
|
20
|
+
encodings/compact_lang_det/win/cld_utf8statetable.cc \
|
|
21
|
+
encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
|
|
22
|
+
encodings/internal/encodings.cc \
|
|
23
|
+
languages/internal/languages.cc \
|
|
24
|
+
thunk.cc
|
|
25
|
+
|
|
26
|
+
install:
|
|
27
|
+
rm -f *.o
|
|
28
|
+
rm -f libcld.a
|
|
29
|
+
$(CC) -c $(CFLAGS) $(SOURCES)
|
|
30
|
+
$(AR) rcs libcld.a *.o
|
|
31
|
+
$(CC) -DCLD_WINDOWS -I. -L. -shared -o cld.so -lstdc++ *.o
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/{encodings → ext/cld/encodings}/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/ext/cld/extconf.rb
CHANGED
|
@@ -1,7 +1,2 @@
|
|
|
1
|
-
require
|
|
2
|
-
|
|
3
|
-
puts home_dir
|
|
4
|
-
cmd = "cd #{home_dir}; ./build.sh"
|
|
5
|
-
sh cmd
|
|
6
|
-
sh "mv #{home_dir}/cld.so #{home_dir}/ext/cld/"
|
|
7
|
-
sh "echo 'install:\n\tdate' > #{home_dir}/ext/cld/Makefile"
|
|
1
|
+
#require 'mkmf'
|
|
2
|
+
#create_makefile('cld/cld')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/ext/cld/thunk.cc
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#include <stdio.h>
|
|
2
|
+
#include <string.h>
|
|
3
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
|
4
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
|
5
|
+
#include "encodings/compact_lang_det/unittest_data.h"
|
|
6
|
+
#include "encodings/proto/encodings.pb.h"
|
|
7
|
+
|
|
8
|
+
typedef struct {
|
|
9
|
+
const char *name;
|
|
10
|
+
const char *code;
|
|
11
|
+
bool reliable;
|
|
12
|
+
} RESULT;
|
|
13
|
+
|
|
14
|
+
extern "C" {
|
|
15
|
+
RESULT detectLanguageThunkInt(const char * src) {
|
|
16
|
+
bool is_plain_text = true;
|
|
17
|
+
bool do_allow_extended_languages = true;
|
|
18
|
+
bool do_pick_summary_language = false;
|
|
19
|
+
bool do_remove_weak_matches = false;
|
|
20
|
+
bool is_reliable;
|
|
21
|
+
Language plus_one = UNKNOWN_LANGUAGE;
|
|
22
|
+
const char* tld_hint = NULL;
|
|
23
|
+
int encoding_hint = UNKNOWN_ENCODING;
|
|
24
|
+
Language language_hint = UNKNOWN_LANGUAGE;
|
|
25
|
+
|
|
26
|
+
double normalized_score3[3];
|
|
27
|
+
Language language3[3];
|
|
28
|
+
int percent3[3];
|
|
29
|
+
int text_bytes;
|
|
30
|
+
|
|
31
|
+
Language lang;
|
|
32
|
+
lang = CompactLangDet::DetectLanguage(0,
|
|
33
|
+
src, strlen(src),
|
|
34
|
+
is_plain_text,
|
|
35
|
+
do_allow_extended_languages,
|
|
36
|
+
do_pick_summary_language,
|
|
37
|
+
do_remove_weak_matches,
|
|
38
|
+
tld_hint,
|
|
39
|
+
encoding_hint,
|
|
40
|
+
language_hint,
|
|
41
|
+
language3,
|
|
42
|
+
percent3,
|
|
43
|
+
normalized_score3,
|
|
44
|
+
&text_bytes,
|
|
45
|
+
&is_reliable);
|
|
46
|
+
|
|
47
|
+
RESULT res;
|
|
48
|
+
res.name = LanguageName(lang);
|
|
49
|
+
res.code = ExtLanguageCode(lang);
|
|
50
|
+
res.reliable = is_reliable;
|
|
51
|
+
return res;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
int main(int argc, char **argv) {
|
|
56
|
+
}
|
data/lib/cld.rb
CHANGED
|
@@ -1,12 +1,21 @@
|
|
|
1
|
-
require "
|
|
1
|
+
require "cld/version"
|
|
2
2
|
require "ffi"
|
|
3
3
|
|
|
4
4
|
module CLD
|
|
5
5
|
extend FFI::Library
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
detect_language(text) == 0
|
|
6
|
+
|
|
7
|
+
def self.detect_language(text)
|
|
8
|
+
result = detect_language_ext(text)
|
|
9
|
+
Hash[ result.members.map {|member| [member.to_sym, result[member]]} ]
|
|
11
10
|
end
|
|
11
|
+
|
|
12
|
+
private
|
|
13
|
+
|
|
14
|
+
class ReturnValue < FFI::Struct
|
|
15
|
+
layout :name, :string, :code, :string, :reliable, :bool
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
GEM_ROOT = File.expand_path("../../", __FILE__)
|
|
19
|
+
ffi_lib "#{GEM_ROOT}/ext/cld/cld.so"
|
|
20
|
+
attach_function "detect_language_ext","detectLanguageThunkInt", [:buffer_in], ReturnValue.by_value
|
|
12
21
|
end
|