cld3 3.2.5 → 3.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
- data/cld3.gemspec +1 -1
- data/ext/cld3/Makefile +266 -0
- data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
- data/ext/cld3/{ext/src/base.h → base.h} +0 -0
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
- data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
- data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
- data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
- data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +36 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
- data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
- data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
- data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
- data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
- data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
- data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
- data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
- data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
- data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
- data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
- data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
- data/ext/cld3/workspace.o +0 -0
- metadata +96 -81
- data/ext/cld3/ext/CMakeLists.txt +0 -69
- data/ext/cld3/ext/CONTRIBUTING.md +0 -26
- data/ext/cld3/ext/README.md +0 -73
- data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
- data/ext/cld3/ext/model.png +0 -0
- data/ext/cld3/ext/src/BUILD.gn +0 -133
- data/ext/cld3/ext/src/DEPS +0 -4
- data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
- data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
- data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
- data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
- data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
- data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
- data/ext/cld3/ext/src/script_detector_test.cc +0 -161
- data/ext/cld3/ext/src/script_span/README.md +0 -11
- data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
data/ext/cld3/ext/model.png
DELETED
Binary file
|
data/ext/cld3/ext/src/BUILD.gn
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
# Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
#
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
# you may not use this file except in compliance with the License.
|
5
|
-
# You may obtain a copy of the License at
|
6
|
-
#
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
#
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
# See the License for the specific language governing permissions and
|
13
|
-
# limitations under the License.
|
14
|
-
#==============================================================================
|
15
|
-
|
16
|
-
import("//third_party/protobuf/proto_library.gni")
|
17
|
-
|
18
|
-
proto_library("protos") {
|
19
|
-
sources = [
|
20
|
-
"feature_extractor.proto",
|
21
|
-
"sentence.proto",
|
22
|
-
"task_spec.proto",
|
23
|
-
]
|
24
|
-
proto_out_dir = "cld_3/protos"
|
25
|
-
}
|
26
|
-
|
27
|
-
static_library("cld_3") {
|
28
|
-
sources = [
|
29
|
-
"base.cc",
|
30
|
-
"base.h",
|
31
|
-
"casts.h",
|
32
|
-
"embedding_feature_extractor.cc",
|
33
|
-
"embedding_feature_extractor.h",
|
34
|
-
"embedding_network.cc",
|
35
|
-
"embedding_network.h",
|
36
|
-
"embedding_network_params.h",
|
37
|
-
"feature_extractor.cc",
|
38
|
-
"feature_extractor.h",
|
39
|
-
"feature_types.cc",
|
40
|
-
"feature_types.h",
|
41
|
-
"float16.h",
|
42
|
-
"fml_parser.cc",
|
43
|
-
"fml_parser.h",
|
44
|
-
"language_identifier_features.cc",
|
45
|
-
"language_identifier_features.h",
|
46
|
-
"lang_id_nn_params.cc",
|
47
|
-
"lang_id_nn_params.h",
|
48
|
-
"nnet_language_identifier.cc",
|
49
|
-
"nnet_language_identifier.h",
|
50
|
-
"registry.cc",
|
51
|
-
"registry.h",
|
52
|
-
"relevant_script_feature.cc",
|
53
|
-
"relevant_script_feature.h",
|
54
|
-
"script_detector.h",
|
55
|
-
"sentence_features.cc",
|
56
|
-
"sentence_features.h",
|
57
|
-
"simple_adder.h",
|
58
|
-
"script_span/fixunicodevalue.cc",
|
59
|
-
"script_span/fixunicodevalue.h",
|
60
|
-
"script_span/generated_entities.cc",
|
61
|
-
"script_span/generated_ulscript.cc",
|
62
|
-
"script_span/generated_ulscript.h",
|
63
|
-
"script_span/getonescriptspan.cc",
|
64
|
-
"script_span/getonescriptspan.h",
|
65
|
-
"script_span/integral_types.h",
|
66
|
-
"script_span/offsetmap.cc",
|
67
|
-
"script_span/offsetmap.h",
|
68
|
-
"script_span/port.h",
|
69
|
-
"script_span/stringpiece.h",
|
70
|
-
"script_span/text_processing.cc",
|
71
|
-
"script_span/text_processing.h",
|
72
|
-
"script_span/utf8acceptinterchange.h",
|
73
|
-
"script_span/utf8prop_lettermarkscriptnum.h",
|
74
|
-
"script_span/utf8repl_lettermarklower.h",
|
75
|
-
"script_span/utf8scannot_lettermarkspecial.h",
|
76
|
-
"script_span/utf8statetable.cc",
|
77
|
-
"script_span/utf8statetable.h",
|
78
|
-
"task_context.cc",
|
79
|
-
"task_context.h",
|
80
|
-
"task_context_params.cc",
|
81
|
-
"task_context_params.h",
|
82
|
-
"unicodetext.cc",
|
83
|
-
"unicodetext.h",
|
84
|
-
"utils.cc",
|
85
|
-
"utils.h",
|
86
|
-
"workspace.cc",
|
87
|
-
"workspace.h",
|
88
|
-
]
|
89
|
-
public_deps = [
|
90
|
-
"//third_party/protobuf:protobuf_lite",
|
91
|
-
":protos",
|
92
|
-
]
|
93
|
-
}
|
94
|
-
|
95
|
-
# The executables below are functional. Uncomment to use.
|
96
|
-
|
97
|
-
#executable("language_identifier_main") {
|
98
|
-
# sources = [
|
99
|
-
# "language_identifier_main.cc",
|
100
|
-
# ]
|
101
|
-
# deps = [
|
102
|
-
# ":cld_3",
|
103
|
-
# ]
|
104
|
-
#}
|
105
|
-
|
106
|
-
#executable("getonescriptspan_test") {
|
107
|
-
# sources = [
|
108
|
-
# "script_span/getonescriptspan_test.cc",
|
109
|
-
# ]
|
110
|
-
# deps = [
|
111
|
-
# ":cld_3",
|
112
|
-
# ]
|
113
|
-
#}
|
114
|
-
|
115
|
-
#executable("language_identifier_features_test") {
|
116
|
-
# sources = [
|
117
|
-
# "language_identifier_features_test.cc",
|
118
|
-
# ]
|
119
|
-
# deps = [
|
120
|
-
# ":cld_3",
|
121
|
-
# ]
|
122
|
-
#}
|
123
|
-
|
124
|
-
#executable("nnet_lang_id_test") {
|
125
|
-
# sources = [
|
126
|
-
# "nnet_lang_id_test.cc",
|
127
|
-
# "nnet_lang_id_test_data.cc",
|
128
|
-
# "nnet_lang_id_test_data.h",
|
129
|
-
# ]
|
130
|
-
# deps = [
|
131
|
-
# ":cld_3",
|
132
|
-
# ]
|
133
|
-
#}
|
data/ext/cld3/ext/src/DEPS
DELETED
data/ext/cld3/ext/src/language_identifier_features_test.cc
DELETED
@@ -1,261 +0,0 @@
|
|
1
|
-
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
14
|
-
==============================================================================*/
|
15
|
-
|
16
|
-
#include <cmath>
|
17
|
-
#include <iostream>
|
18
|
-
#include <vector>
|
19
|
-
#include <set>
|
20
|
-
|
21
|
-
#include "base.h"
|
22
|
-
#include "feature_extractor.h"
|
23
|
-
#include "language_identifier_features.h"
|
24
|
-
#include "nnet_language_identifier.h"
|
25
|
-
#include "script_span/generated_ulscript.h"
|
26
|
-
#include "cld_3/protos/sentence.pb.h"
|
27
|
-
#include "task_context.h"
|
28
|
-
#include "utils.h"
|
29
|
-
#include "workspace.h"
|
30
|
-
|
31
|
-
namespace chrome_lang_id {
|
32
|
-
namespace language_identifier_features_test {
|
33
|
-
|
34
|
-
static WholeSentenceFeature *cbog_factory() {
|
35
|
-
return new ContinuousBagOfNgramsFunction;
|
36
|
-
}
|
37
|
-
|
38
|
-
static WholeSentenceFeature *sf_factory() { return new ScriptFeature; }
|
39
|
-
|
40
|
-
// Class for calculating the feature weights and ids.
|
41
|
-
class FeatureIdWeightCalculator {
|
42
|
-
public:
|
43
|
-
explicit FeatureIdWeightCalculator(TaskContext *context) {
|
44
|
-
if (WholeSentenceFeature::registry() == nullptr) {
|
45
|
-
// Create registry for our WholeSentenceFeature(s).
|
46
|
-
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
|
47
|
-
"sentence feature function", "WholeSentenceFeature", __FILE__,
|
48
|
-
__LINE__);
|
49
|
-
}
|
50
|
-
|
51
|
-
// Register our WholeSentenceFeature(s).
|
52
|
-
// Register ContinuousBagOfNgramsFunction feature function.
|
53
|
-
static WholeSentenceFeature::Registry::Registrar cbog_registrar(
|
54
|
-
WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
|
55
|
-
"ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);
|
56
|
-
|
57
|
-
// Register Script feature function.
|
58
|
-
static WholeSentenceFeature::Registry::Registrar sf_registrar(
|
59
|
-
WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
|
60
|
-
__LINE__, sf_factory);
|
61
|
-
|
62
|
-
feature_extractor_.Setup(context);
|
63
|
-
feature_extractor_.Init(context);
|
64
|
-
}
|
65
|
-
|
66
|
-
// Assumes that a single feature is specified and extracts it.
|
67
|
-
void ExtractOnlyFeature(Sentence *sentence,
|
68
|
-
std::vector<FeatureVector> *features) {
|
69
|
-
CLD3_CHECK(features->size() == 1);
|
70
|
-
WorkspaceSet workspace;
|
71
|
-
workspace.Reset(workspace_registry_);
|
72
|
-
feature_extractor_.Preprocess(&workspace, sentence);
|
73
|
-
feature_extractor_.ExtractFeatures(workspace, *sentence, features);
|
74
|
-
CLD3_CHECK(features->size() == 1);
|
75
|
-
}
|
76
|
-
|
77
|
-
// Returns a map from feature value id to feature value weight.
|
78
|
-
std::unordered_map<int, float> GetFloatFeatureValIdsAndWeights(
|
79
|
-
Sentence *sentence) {
|
80
|
-
std::vector<FeatureVector> feature_vectors(1); // one feature space
|
81
|
-
ExtractOnlyFeature(sentence, &feature_vectors);
|
82
|
-
const FeatureVector &feature_vector = feature_vectors.at(0);
|
83
|
-
|
84
|
-
// Save the (feature value id, feature value weight) pairs to a map.
|
85
|
-
std::unordered_map<int, float> feature_id_weight;
|
86
|
-
for (int index = 0; index < feature_vector.size(); ++index) {
|
87
|
-
const FloatFeatureValue feature_value =
|
88
|
-
FloatFeatureValue(feature_vector.value(index));
|
89
|
-
feature_id_weight[feature_value.value.id] = feature_value.value.weight;
|
90
|
-
}
|
91
|
-
return feature_id_weight;
|
92
|
-
}
|
93
|
-
|
94
|
-
// Returns the feature value ids.
|
95
|
-
std::set<int> GetFeatureValueIds(Sentence *sentence) {
|
96
|
-
std::vector<FeatureVector> feature_vectors(1); // one feature space
|
97
|
-
ExtractOnlyFeature(sentence, &feature_vectors);
|
98
|
-
const FeatureVector &feature_vector = feature_vectors.at(0);
|
99
|
-
|
100
|
-
std::set<int> ids;
|
101
|
-
for (int index = 0; index < feature_vector.size(); ++index) {
|
102
|
-
ids.insert(feature_vector.value(index));
|
103
|
-
}
|
104
|
-
return ids;
|
105
|
-
}
|
106
|
-
|
107
|
-
private:
|
108
|
-
// The registry of shared workspaces in the feature extractor.
|
109
|
-
WorkspaceRegistry workspace_registry_;
|
110
|
-
LanguageIdEmbeddingFeatureExtractor feature_extractor_;
|
111
|
-
};
|
112
|
-
|
113
|
-
// Extracts features and checks that their ids and weights are correct.
|
114
|
-
bool ExtractAndCheckFeatures(const string &features, const int id_dim,
|
115
|
-
const std::vector<string> &expected_char_ngrams,
|
116
|
-
const std::vector<float> &expected_weights,
|
117
|
-
Sentence *sentence) {
|
118
|
-
TaskContext context;
|
119
|
-
context.SetParameter("language_identifier_features", features);
|
120
|
-
FeatureIdWeightCalculator calc(&context);
|
121
|
-
|
122
|
-
// Get the feature ids and the corresponding weights.
|
123
|
-
const std::unordered_map<int, float> feature_id_weight =
|
124
|
-
calc.GetFloatFeatureValIdsAndWeights(sentence);
|
125
|
-
if (feature_id_weight.size() != expected_char_ngrams.size()) {
|
126
|
-
std::cout << " Failure" << std::endl;
|
127
|
-
std::cout << " Number of expected feature ids: "
|
128
|
-
<< expected_char_ngrams.size() << std::endl;
|
129
|
-
std::cout << " Number of extracted feature ids: "
|
130
|
-
<< feature_id_weight.size() << std::endl;
|
131
|
-
return false;
|
132
|
-
}
|
133
|
-
|
134
|
-
// Specifies how close two float values should be to be considered equal.
|
135
|
-
const float epsilon = 0.0001f;
|
136
|
-
bool test_successful = true;
|
137
|
-
for (size_t i = 0; i < expected_char_ngrams.size(); ++i) {
|
138
|
-
const int expected_id =
|
139
|
-
utils::Hash32WithDefaultSeed(expected_char_ngrams.at(i)) % id_dim;
|
140
|
-
|
141
|
-
// Check the ids and the weights.
|
142
|
-
if (feature_id_weight.count(expected_id) == 0) {
|
143
|
-
std::cout << " Failure" << std::endl;
|
144
|
-
std::cout << " Feature id " << expected_id << " is missing" << std::endl;
|
145
|
-
test_successful = false;
|
146
|
-
} else {
|
147
|
-
if (std::abs(feature_id_weight.at(expected_id) - expected_weights.at(i)) >
|
148
|
-
epsilon) {
|
149
|
-
std::cout << " Failure" << std::endl;
|
150
|
-
std::cout << " Different weight for feature id " << expected_id
|
151
|
-
<< ": expected weight " << expected_weights.at(i)
|
152
|
-
<< ", actual weight " << feature_id_weight.at(expected_id)
|
153
|
-
<< std::endl;
|
154
|
-
test_successful = false;
|
155
|
-
}
|
156
|
-
}
|
157
|
-
}
|
158
|
-
|
159
|
-
if (test_successful) {
|
160
|
-
std::cout << " Success!" << std::endl;
|
161
|
-
}
|
162
|
-
return test_successful;
|
163
|
-
}
|
164
|
-
|
165
|
-
// Tests the case when ngram features get equal weight. Returns "true" if the
|
166
|
-
// test is successful and "false" otherwise.
|
167
|
-
bool TestExtractFeaturesWithEqualWeight() {
|
168
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
169
|
-
|
170
|
-
// The integer id of each char ngram is computed as follows:
|
171
|
-
// utils::Hash32WithDefaultSeed(char ngram) % id_dim.
|
172
|
-
const int id_dim = 100;
|
173
|
-
const string features = "continuous-bag-of-ngrams(id_dim=" +
|
174
|
-
std::to_string(id_dim) +
|
175
|
-
",size=2,include_terminators=true,include_" +
|
176
|
-
"spaces=false,use_equal_weight=true)";
|
177
|
-
Sentence sentence;
|
178
|
-
sentence.set_text("aa aab");
|
179
|
-
const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
|
180
|
-
const std::vector<float> expected_weights = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
|
181
|
-
return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
|
182
|
-
expected_weights, &sentence);
|
183
|
-
}
|
184
|
-
|
185
|
-
// Tests the case when ngram features get weights equal to their normalized
|
186
|
-
// counts. Returns "true" if the test is successful and "false" otherwise.
|
187
|
-
bool TestExtractFeaturesWithNonEqualWeight() {
|
188
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
189
|
-
|
190
|
-
// The integer id of each char ngram is computed as follows:
|
191
|
-
// utils::Hash32WithDefaultSeed(char ngram) % id_dim.
|
192
|
-
const int id_dim = 100;
|
193
|
-
const string features = "continuous-bag-of-ngrams(id_dim=" +
|
194
|
-
std::to_string(id_dim) +
|
195
|
-
",size=2,include_terminators=true,include_" +
|
196
|
-
"spaces=false,use_equal_weight=false)";
|
197
|
-
Sentence sentence;
|
198
|
-
sentence.set_text("aa aab");
|
199
|
-
const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
|
200
|
-
const std::vector<float> expected_weights{0.1428f, 0.1428f, 0.2857f, 0.2857f,
|
201
|
-
0.1428f};
|
202
|
-
return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
|
203
|
-
expected_weights, &sentence);
|
204
|
-
}
|
205
|
-
|
206
|
-
// Tests the feature Script.
|
207
|
-
bool TestScriptFeature() {
|
208
|
-
std::cout << "Running " << __FUNCTION__ << std::endl;
|
209
|
-
|
210
|
-
bool test_successful = true;
|
211
|
-
TaskContext context;
|
212
|
-
context.SetParameter("language_identifier_features", "script");
|
213
|
-
FeatureIdWeightCalculator calc(&context);
|
214
|
-
|
215
|
-
// Check the script of the English sentence.
|
216
|
-
Sentence sentence;
|
217
|
-
sentence.set_text("food");
|
218
|
-
std::set<int> feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
219
|
-
if (feature_val_ids.size() != 1 ||
|
220
|
-
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Latin) == 0) {
|
221
|
-
test_successful = false;
|
222
|
-
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
223
|
-
}
|
224
|
-
|
225
|
-
// Check the script of a Chinese sentence.
|
226
|
-
sentence.set_text("字");
|
227
|
-
feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
228
|
-
if (feature_val_ids.size() != 1 ||
|
229
|
-
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Hani) == 0) {
|
230
|
-
test_successful = false;
|
231
|
-
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
232
|
-
}
|
233
|
-
|
234
|
-
// Check the script of a Korean sentence.
|
235
|
-
sentence.set_text("워드");
|
236
|
-
feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
237
|
-
if (feature_val_ids.size() != 1 ||
|
238
|
-
feature_val_ids.count(chrome_lang_id::CLD2::NUM_ULSCRIPTS) == 0) {
|
239
|
-
test_successful = false;
|
240
|
-
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
241
|
-
}
|
242
|
-
|
243
|
-
if (test_successful) {
|
244
|
-
std::cout << " Success!" << std::endl;
|
245
|
-
}
|
246
|
-
return test_successful;
|
247
|
-
}
|
248
|
-
|
249
|
-
} // namespace language_identifier_features_test
|
250
|
-
} // namespace chrome_lang_id
|
251
|
-
|
252
|
-
// Runs the feature extraction tests.
|
253
|
-
int main(int argc, char **argv) {
|
254
|
-
const bool tests_successful =
|
255
|
-
chrome_lang_id::language_identifier_features_test::
|
256
|
-
TestExtractFeaturesWithEqualWeight() &&
|
257
|
-
chrome_lang_id::language_identifier_features_test::
|
258
|
-
TestExtractFeaturesWithNonEqualWeight() &&
|
259
|
-
chrome_lang_id::language_identifier_features_test::TestScriptFeature();
|
260
|
-
return tests_successful ? 0 : 1;
|
261
|
-
}
|
data/ext/cld3/ext/src/language_identifier_main.cc
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
-
|
3
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
you may not use this file except in compliance with the License.
|
5
|
-
You may obtain a copy of the License at
|
6
|
-
|
7
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
|
9
|
-
Unless required by applicable law or agreed to in writing, software
|
10
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
See the License for the specific language governing permissions and
|
13
|
-
limitations under the License.
|
14
|
-
==============================================================================*/
|
15
|
-
|
16
|
-
#include <iostream>
|
17
|
-
#include <string>
|
18
|
-
|
19
|
-
#include "base.h"
|
20
|
-
#include "nnet_language_identifier.h"
|
21
|
-
|
22
|
-
using chrome_lang_id::NNetLanguageIdentifier;
|
23
|
-
|
24
|
-
// Runs a neural net model for language identification.
|
25
|
-
int main(int argc, char **argv) {
|
26
|
-
NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
|
27
|
-
/*max_num_bytes=*/1000);
|
28
|
-
|
29
|
-
const std::vector<std::string> texts{"This text is written in English.",
|
30
|
-
"Text in deutscher Sprache verfasst."};
|
31
|
-
for (const std::string &text : texts) {
|
32
|
-
const NNetLanguageIdentifier::Result result = lang_id.FindLanguage(text);
|
33
|
-
std::cout << "text: " << text << std::endl
|
34
|
-
<< " language: " << result.language << std::endl
|
35
|
-
<< " probability: " << result.probability << std::endl
|
36
|
-
<< " reliable: " << result.is_reliable << std::endl
|
37
|
-
<< " proportion: " << result.proportion << std::endl
|
38
|
-
<< std::endl;
|
39
|
-
}
|
40
|
-
|
41
|
-
const std::string &text =
|
42
|
-
"This piece of text is in English. Този текст е на Български.";
|
43
|
-
std::cout << "text: " << text << std::endl;
|
44
|
-
const std::vector<NNetLanguageIdentifier::Result> results =
|
45
|
-
lang_id.FindTopNMostFreqLangs(text, /*num_langs*/ 3);
|
46
|
-
for (const NNetLanguageIdentifier::Result &result : results) {
|
47
|
-
std::cout << " language: " << result.language << std::endl
|
48
|
-
<< " probability: " << result.probability << std::endl
|
49
|
-
<< " reliable: " << result.is_reliable << std::endl
|
50
|
-
<< " proportion: " << result.proportion << std::endl
|
51
|
-
<< std::endl;
|
52
|
-
}
|
53
|
-
return 0;
|
54
|
-
}
|