cld3 3.2.5 → 3.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
  3. data/cld3.gemspec +1 -1
  4. data/ext/cld3/Makefile +266 -0
  5. data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
  6. data/ext/cld3/{ext/src/base.h → base.h} +0 -0
  7. data/ext/cld3/base.o +0 -0
  8. data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
  9. data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
  10. data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
  13. data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
  14. data/ext/cld3/embedding_network.o +0 -0
  15. data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
  16. data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
  17. data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
  18. data/ext/cld3/feature_extractor.o +0 -0
  19. data/ext/cld3/feature_extractor.pb.o +0 -0
  20. data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
  21. data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
  22. data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
  23. data/ext/cld3/feature_types.o +0 -0
  24. data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
  25. data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
  26. data/ext/cld3/fixunicodevalue.o +0 -0
  27. data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
  28. data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
  29. data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
  30. data/ext/cld3/fml_parser.o +0 -0
  31. data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
  32. data/ext/cld3/generated_entities.o +0 -0
  33. data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
  34. data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
  35. data/ext/cld3/generated_ulscript.o +0 -0
  36. data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
  37. data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
  38. data/ext/cld3/getonescriptspan.o +0 -0
  39. data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
  40. data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
  41. data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
  42. data/ext/cld3/lang_id_nn_params.o +0 -0
  43. data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
  44. data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
  45. data/ext/cld3/language_identifier_features.o +0 -0
  46. data/ext/cld3/libcld3.so +0 -0
  47. data/ext/cld3/mkmf.log +36 -0
  48. data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
  49. data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
  50. data/ext/cld3/nnet_language_identifier.o +0 -0
  51. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  52. data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
  53. data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
  54. data/ext/cld3/offsetmap.o +0 -0
  55. data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
  56. data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
  57. data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
  58. data/ext/cld3/registry.o +0 -0
  59. data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
  60. data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
  61. data/ext/cld3/relevant_script_feature.o +0 -0
  62. data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
  63. data/ext/cld3/sentence.pb.o +0 -0
  64. data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
  65. data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
  66. data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
  67. data/ext/cld3/sentence_features.o +0 -0
  68. data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
  69. data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
  70. data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
  71. data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
  72. data/ext/cld3/task_context.o +0 -0
  73. data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
  74. data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
  75. data/ext/cld3/task_context_params.o +0 -0
  76. data/ext/cld3/task_spec.pb.o +0 -0
  77. data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
  78. data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
  79. data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
  80. data/ext/cld3/text_processing.o +0 -0
  81. data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
  82. data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
  83. data/ext/cld3/unicodetext.o +0 -0
  84. data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
  85. data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
  86. data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
  87. data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
  88. data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
  89. data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
  90. data/ext/cld3/utf8statetable.o +0 -0
  91. data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
  92. data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
  93. data/ext/cld3/utils.o +0 -0
  94. data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
  95. data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
  96. data/ext/cld3/workspace.o +0 -0
  97. metadata +96 -81
  98. data/ext/cld3/ext/CMakeLists.txt +0 -69
  99. data/ext/cld3/ext/CONTRIBUTING.md +0 -26
  100. data/ext/cld3/ext/README.md +0 -73
  101. data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
  102. data/ext/cld3/ext/model.png +0 -0
  103. data/ext/cld3/ext/src/BUILD.gn +0 -133
  104. data/ext/cld3/ext/src/DEPS +0 -4
  105. data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
  106. data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
  107. data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
  108. data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
  109. data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
  110. data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
  111. data/ext/cld3/ext/src/script_detector_test.cc +0 -161
  112. data/ext/cld3/ext/src/script_span/README.md +0 -11
  113. data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
@@ -1,11 +0,0 @@
1
- The code in this directory identifies the scripts present in a given piece of
2
- text along with the corresponding spans. The code was copied from
3
- [CLD2](https://github.com/CLD2Owners/cld2) and was slightly refactored. It can
4
- be further simplified and cleaned up.
5
-
6
-
7
-
8
-
9
-
10
-
11
-
@@ -1,135 +0,0 @@
1
- /* Copyright 2016 Google Inc. All Rights Reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- ==============================================================================*/
15
-
16
- #include "getonescriptspan.h"
17
-
18
- #include <iostream>
19
- #include <vector>
20
-
21
- namespace chrome_lang_id {
22
- namespace CLD2 {
23
- namespace getonescriptspan_test {
24
-
25
- // Tests invalid and interchange-invalid input. Returns "true" if the test is
26
- // successful and "false" otherwise.
27
- bool TestInvalidUTF8Input() {
28
- std::cout << "Running " << __FUNCTION__ << std::endl;
29
- const std::vector<std::string> invalid_strings{"\xC0\xA9",
30
- "\377\377\377\377"};
31
- const std::string gold_valid_prefix = "Some valid bytes followed by ";
32
-
33
- // Iterates over the invalid strings, inserts each of them in the middle of a
34
- // piece of text, and checks whether these strings are correctly identified.
35
- bool test_successful = true;
36
- for (size_t i = 0; i < invalid_strings.size(); ++i) {
37
- const std::string text = "Some valid bytes followed by " +
38
- invalid_strings.at(i) +
39
- " and then valid ones again.";
40
-
41
- const int num_valid_bytes = SpanInterchangeValid(text.c_str(), text.size());
42
- const std::string detected_valid_prefix(text.c_str(), num_valid_bytes);
43
- std::cout << " Testing input string at position " << i << std::endl;
44
- if (detected_valid_prefix == gold_valid_prefix) {
45
- std::cout << " Success!" << std::endl;
46
- } else {
47
- std::cout << " Failure" << std::endl;
48
- std::cout << " Gold: " << gold_valid_prefix << std::endl;
49
- std::cout << " Detected: " << detected_valid_prefix << std::endl;
50
- test_successful = false;
51
- }
52
- }
53
- return test_successful;
54
- }
55
-
56
- // Tests whether different scripts are correctly detected. Returns "true" if the
57
- // test is successful and "false" otherwise.
58
- bool TestScriptDetection() {
59
- std::cout << "Running " << __FUNCTION__ << std::endl;
60
-
61
- // Text containing a snippet in English, a snippet in Bulgarian, and a snippet
62
- // in English again.
63
- const std::string text =
64
- "Text in English. Текст на Български. Also text in English.";
65
- const std::vector<std::string> gold_script_spans{
66
- " Text in English ", " Текст на Български ", " Also text in English "};
67
-
68
- std::vector<std::string> detected_script_spans;
69
- ScriptScanner ss(text.c_str(), text.size(), /*is_plain_text=*/true);
70
- LangSpan script_span;
71
- while (ss.GetOneScriptSpan(&script_span)) {
72
- detected_script_spans.emplace_back(script_span.text,
73
- script_span.text_bytes);
74
- }
75
-
76
- if (detected_script_spans.size() != gold_script_spans.size()) {
77
- std::cout << " Failure" << std::endl;
78
- std::cout << " Number of gold spans " << gold_script_spans.size()
79
- << std::endl;
80
- std::cout << " Number of detected spans " << detected_script_spans.size()
81
- << std::endl;
82
- return false;
83
- }
84
- for (size_t i = 0; i < detected_script_spans.size(); ++i) {
85
- if (detected_script_spans.at(i) != gold_script_spans.at(i)) {
86
- std::cout << " Failure" << std::endl;
87
- std::cout << " Gold span: " << gold_script_spans.at(i) << std::endl;
88
- std::cout << " Detected span: " << detected_script_spans.at(i)
89
- << std::endl;
90
- return false;
91
- }
92
- }
93
- std::cout << " Success!" << std::endl;
94
- return true;
95
- }
96
-
97
- // Tests the case when the input string is truncated in such a way that a
98
- // character is split in two pieces. Returns "true" if the test is successful
99
- // and "false" otherwise.
100
- bool TestStringCut() {
101
- std::cout << "Running " << __FUNCTION__ << std::endl;
102
-
103
- // Text in Bulgarian (Cyrillic script).
104
- const std::string text = "Текст на Български";
105
-
106
- // The size of the first two words ("Текст на ") is 16, and size of the first
107
- // two words plus the first char of the third word ("Текст на Б") is 18, so a
108
- // threshold of 17 results in slicing the first char of the third word.
109
- const int first_two_words_size = 16;
110
- const int span_size = 17;
111
- const int num_valid_bytes = SpanInterchangeValid(text.c_str(), span_size);
112
- if (num_valid_bytes == first_two_words_size) {
113
- std::cout << " Success!" << std::endl;
114
- return true;
115
- } else {
116
- std::cout << " Failure" << std::endl;
117
- std::cout << " Size of gold interchange-valid span: "
118
- << first_two_words_size << std::endl;
119
- std::cout << " Size of detected span: " << num_valid_bytes << std::endl;
120
- return false;
121
- }
122
- }
123
-
124
- } // namespace getonescriptspan_test
125
- } // namespace CLD2
126
- } // namespace chrome_lang_id
127
-
128
- // Runs the functions above.
129
- int main(int argc, char **argv) {
130
- const bool tests_successful =
131
- chrome_lang_id::CLD2::getonescriptspan_test::TestInvalidUTF8Input() &&
132
- chrome_lang_id::CLD2::getonescriptspan_test::TestScriptDetection() &&
133
- chrome_lang_id::CLD2::getonescriptspan_test::TestStringCut();
134
- return tests_successful ? 0 : 1;
135
- }