cld3 3.2.5 → 3.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
- data/cld3.gemspec +1 -1
- data/ext/cld3/Makefile +266 -0
- data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
- data/ext/cld3/{ext/src/base.h → base.h} +0 -0
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
- data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
- data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
- data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
- data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +36 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
- data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
- data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
- data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
- data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
- data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
- data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
- data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
- data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
- data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
- data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
- data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
- data/ext/cld3/workspace.o +0 -0
- metadata +96 -81
- data/ext/cld3/ext/CMakeLists.txt +0 -69
- data/ext/cld3/ext/CONTRIBUTING.md +0 -26
- data/ext/cld3/ext/README.md +0 -73
- data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
- data/ext/cld3/ext/model.png +0 -0
- data/ext/cld3/ext/src/BUILD.gn +0 -133
- data/ext/cld3/ext/src/DEPS +0 -4
- data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
- data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
- data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
- data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
- data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
- data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
- data/ext/cld3/ext/src/script_detector_test.cc +0 -161
- data/ext/cld3/ext/src/script_span/README.md +0 -11
- data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
@@ -1,11 +0,0 @@
|
|
1
|
-
The code in this directory identifies the scripts present in a given piece of
|
2
|
-
text along with the corresponding spans. The code was copied from
|
3
|
-
[CLD2](https://github.com/CLD2Owners/cld2) and was slightly refactored. It can
|
4
|
-
be further simplified and cleaned up.
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
@@ -1,135 +0,0 @@
|
|
1
|
-
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
14
|
-
==============================================================================*/
|
15
|
-
|
16
|
-
#include "getonescriptspan.h"
|
17
|
-
|
18
|
-
#include <iostream>
|
19
|
-
#include <vector>
|
20
|
-
|
21
|
-
namespace chrome_lang_id {
|
22
|
-
namespace CLD2 {
|
23
|
-
namespace getonescriptspan_test {
|
24
|
-
|
25
|
-
// Tests invalid and interchange-invalid input. Returns "true" if the test is
|
26
|
-
// successful and "false" otherwise.
|
27
|
-
bool TestInvalidUTF8Input() {
|
28
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
29
|
-
const std::vector<std::string> invalid_strings{"\xC0\xA9",
|
30
|
-
"\377\377\377\377"};
|
31
|
-
const std::string gold_valid_prefix = "Some valid bytes followed by ";
|
32
|
-
|
33
|
-
// Iterates over the invalid strings, inserts each of them in the middle of a
|
34
|
-
// piece of text, and checks whether these strings are correctly identified.
|
35
|
-
bool test_successful = true;
|
36
|
-
for (size_t i = 0; i < invalid_strings.size(); ++i) {
|
37
|
-
const std::string text = "Some valid bytes followed by " +
|
38
|
-
invalid_strings.at(i) +
|
39
|
-
" and then valid ones again.";
|
40
|
-
|
41
|
-
const int num_valid_bytes = SpanInterchangeValid(text.c_str(), text.size());
|
42
|
-
const std::string detected_valid_prefix(text.c_str(), num_valid_bytes);
|
43
|
-
std::cout << " Testing input string at position " << i << std::endl;
|
44
|
-
if (detected_valid_prefix == gold_valid_prefix) {
|
45
|
-
std::cout << " Success!" << std::endl;
|
46
|
-
} else {
|
47
|
-
std::cout << " Failure" << std::endl;
|
48
|
-
std::cout << " Gold: " << gold_valid_prefix << std::endl;
|
49
|
-
std::cout << " Detected: " << detected_valid_prefix << std::endl;
|
50
|
-
test_successful = false;
|
51
|
-
}
|
52
|
-
}
|
53
|
-
return test_successful;
|
54
|
-
}
|
55
|
-
|
56
|
-
// Tests whether different scripts are correctly detected. Returns "true" if the
|
57
|
-
// test is successful and "false" otherwise.
|
58
|
-
bool TestScriptDetection() {
|
59
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
60
|
-
|
61
|
-
// Text containing a snippet in English, a snippet in Bulgarian, and a snippet
|
62
|
-
// in English again.
|
63
|
-
const std::string text =
|
64
|
-
"Text in English. Текст на Български. Also text in English.";
|
65
|
-
const std::vector<std::string> gold_script_spans{
|
66
|
-
" Text in English ", " Текст на Български ", " Also text in English "};
|
67
|
-
|
68
|
-
std::vector<std::string> detected_script_spans;
|
69
|
-
ScriptScanner ss(text.c_str(), text.size(), /*is_plain_text=*/true);
|
70
|
-
LangSpan script_span;
|
71
|
-
while (ss.GetOneScriptSpan(&script_span)) {
|
72
|
-
detected_script_spans.emplace_back(script_span.text,
|
73
|
-
script_span.text_bytes);
|
74
|
-
}
|
75
|
-
|
76
|
-
if (detected_script_spans.size() != gold_script_spans.size()) {
|
77
|
-
std::cout << " Failure" << std::endl;
|
78
|
-
std::cout << " Number of gold spans " << gold_script_spans.size()
|
79
|
-
<< std::endl;
|
80
|
-
std::cout << " Number of detected spans " << detected_script_spans.size()
|
81
|
-
<< std::endl;
|
82
|
-
return false;
|
83
|
-
}
|
84
|
-
for (size_t i = 0; i < detected_script_spans.size(); ++i) {
|
85
|
-
if (detected_script_spans.at(i) != gold_script_spans.at(i)) {
|
86
|
-
std::cout << " Failure" << std::endl;
|
87
|
-
std::cout << " Gold span: " << gold_script_spans.at(i) << std::endl;
|
88
|
-
std::cout << " Detected span: " << detected_script_spans.at(i)
|
89
|
-
<< std::endl;
|
90
|
-
return false;
|
91
|
-
}
|
92
|
-
}
|
93
|
-
std::cout << " Success!" << std::endl;
|
94
|
-
return true;
|
95
|
-
}
|
96
|
-
|
97
|
-
// Tests the case when the input string is truncated in such a way that a
|
98
|
-
// character is split in two pieces. Returns "true" if the test is successful
|
99
|
-
// and "false" otherwise.
|
100
|
-
bool TestStringCut() {
|
101
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
102
|
-
|
103
|
-
// Text in Bulgarian (Cyrillic script).
|
104
|
-
const std::string text = "Текст на Български";
|
105
|
-
|
106
|
-
// The size of the first two words ("Текст на ") is 16, and size of the first
|
107
|
-
// two words plus the first char of the third word ("Текст на Б") is 18, so a
|
108
|
-
// threshold of 17 results in slicing the first char of the third word.
|
109
|
-
const int first_two_words_size = 16;
|
110
|
-
const int span_size = 17;
|
111
|
-
const int num_valid_bytes = SpanInterchangeValid(text.c_str(), span_size);
|
112
|
-
if (num_valid_bytes == first_two_words_size) {
|
113
|
-
std::cout << " Success!" << std::endl;
|
114
|
-
return true;
|
115
|
-
} else {
|
116
|
-
std::cout << " Failure" << std::endl;
|
117
|
-
std::cout << " Size of gold interchange-valid span: "
|
118
|
-
<< first_two_words_size << std::endl;
|
119
|
-
std::cout << " Size of detected span: " << num_valid_bytes << std::endl;
|
120
|
-
return false;
|
121
|
-
}
|
122
|
-
}
|
123
|
-
|
124
|
-
} // namespace getonescriptspan_test
|
125
|
-
} // namespace CLD2
|
126
|
-
} // namespace chrome_lang_id
|
127
|
-
|
128
|
-
// Runs the functions above.
|
129
|
-
int main(int argc, char **argv) {
|
130
|
-
const bool tests_successful =
|
131
|
-
chrome_lang_id::CLD2::getonescriptspan_test::TestInvalidUTF8Input() &&
|
132
|
-
chrome_lang_id::CLD2::getonescriptspan_test::TestScriptDetection() &&
|
133
|
-
chrome_lang_id::CLD2::getonescriptspan_test::TestStringCut();
|
134
|
-
return tests_successful ? 0 : 1;
|
135
|
-
}
|