cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,178 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef LANG_ID_NN_PARAMS_H_
|
17
|
+
#define LANG_ID_NN_PARAMS_H_
|
18
|
+
|
19
|
+
#include "base.h"
|
20
|
+
#include "embedding_network_params.h"
|
21
|
+
#include "float16.h"
|
22
|
+
|
23
|
+
namespace chrome_lang_id {
|
24
|
+
|
25
|
+
class LangIdNNParams : public EmbeddingNetworkParams {
|
26
|
+
public:
|
27
|
+
~LangIdNNParams() override {}
|
28
|
+
|
29
|
+
// Access methods for embeddings:
|
30
|
+
int embeddings_size() const override { return 6; }
|
31
|
+
int embeddings_num_rows(int i) const override {
|
32
|
+
return kEmbeddingsNumRows[i];
|
33
|
+
}
|
34
|
+
int embeddings_num_cols(int i) const override {
|
35
|
+
return kEmbeddingsNumCols[i];
|
36
|
+
}
|
37
|
+
const void *embeddings_weights(int i) const override {
|
38
|
+
return embeddings_weights_[i];
|
39
|
+
}
|
40
|
+
QuantizationType embeddings_quant_type(int i) const override {
|
41
|
+
return QuantizationType::UINT8;
|
42
|
+
}
|
43
|
+
const float16 *embeddings_quant_scales(int i) const override {
|
44
|
+
return embeddings_quant_scales_[i];
|
45
|
+
}
|
46
|
+
|
47
|
+
// Access methods for hidden:
|
48
|
+
int hidden_size() const override { return 1; }
|
49
|
+
int hidden_num_rows(int i) const override { return kHiddenNumRows[i]; }
|
50
|
+
int hidden_num_cols(int i) const override { return kHiddenNumCols[i]; }
|
51
|
+
const void *hidden_weights(int i) const override {
|
52
|
+
return hidden_weights_[i];
|
53
|
+
}
|
54
|
+
|
55
|
+
// Access methods for hidden_bias:
|
56
|
+
int hidden_bias_size() const override { return 1; }
|
57
|
+
int hidden_bias_num_rows(int i) const override {
|
58
|
+
return kHiddenBiasNumRows[i];
|
59
|
+
}
|
60
|
+
int hidden_bias_num_cols(int i) const override {
|
61
|
+
return kHiddenBiasNumCols[i];
|
62
|
+
}
|
63
|
+
const void *hidden_bias_weights(int i) const override {
|
64
|
+
return hidden_bias_weights_[i];
|
65
|
+
}
|
66
|
+
|
67
|
+
// Access methods for softmax:
|
68
|
+
int softmax_size() const override { return 1; }
|
69
|
+
int softmax_num_rows(int i) const override { return kSoftmaxNumRows[i]; }
|
70
|
+
int softmax_num_cols(int i) const override { return kSoftmaxNumCols[i]; }
|
71
|
+
const void *softmax_weights(int i) const override {
|
72
|
+
return softmax_weights_[i];
|
73
|
+
}
|
74
|
+
|
75
|
+
// Access methods for softmax_bias:
|
76
|
+
int softmax_bias_size() const override { return 1; }
|
77
|
+
int softmax_bias_num_rows(int i) const override {
|
78
|
+
return kSoftmaxBiasNumRows[i];
|
79
|
+
}
|
80
|
+
int softmax_bias_num_cols(int i) const override {
|
81
|
+
return kSoftmaxBiasNumCols[i];
|
82
|
+
}
|
83
|
+
const void *softmax_bias_weights(int i) const override {
|
84
|
+
return softmax_bias_weights_[i];
|
85
|
+
}
|
86
|
+
|
87
|
+
// Access methods for embedding_dim:
|
88
|
+
int embedding_dim_size() const override { return 6; }
|
89
|
+
int32 embedding_dim(int i) const override { return kEmbeddingDimValues[i]; }
|
90
|
+
|
91
|
+
// Access methods for embedding_num_features:
|
92
|
+
int embedding_num_features_size() const override { return 6; }
|
93
|
+
int32 embedding_num_features(int i) const override {
|
94
|
+
return kEmbeddingNumFeaturesValues[i];
|
95
|
+
}
|
96
|
+
|
97
|
+
// Access methods for embedding_features_domain_size:
|
98
|
+
int embedding_features_domain_size_size() const override { return 6; }
|
99
|
+
int32 embedding_features_domain_size(int i) const override {
|
100
|
+
return kEmbeddingFeaturesDomainSizeValues[i];
|
101
|
+
}
|
102
|
+
|
103
|
+
// Access methods for concat_offset:
|
104
|
+
int concat_offset_size() const override { return 6; }
|
105
|
+
int32 concat_offset(int i) const override { return kConcatOffsetValues[i]; }
|
106
|
+
|
107
|
+
// Access methods for concat_layer_size:
|
108
|
+
bool has_concat_layer_size() const override { return true; }
|
109
|
+
int32 concat_layer_size() const override { return 80; }
|
110
|
+
|
111
|
+
// Access methods for is_precomputed:
|
112
|
+
bool has_is_precomputed() const override { return false; }
|
113
|
+
bool is_precomputed() const override { return false; }
|
114
|
+
|
115
|
+
private:
|
116
|
+
// Private fields for embeddings:
|
117
|
+
static const int kEmbeddingsNumRows[];
|
118
|
+
static const int kEmbeddingsNumCols[];
|
119
|
+
static const uint8 kEmbeddingsWeights0[];
|
120
|
+
static const uint8 kEmbeddingsWeights1[];
|
121
|
+
static const uint8 kEmbeddingsWeights2[];
|
122
|
+
static const uint8 kEmbeddingsWeights3[];
|
123
|
+
static const uint8 kEmbeddingsWeights4[];
|
124
|
+
static const uint8 kEmbeddingsWeights5[];
|
125
|
+
const void *embeddings_weights_[6] = {
|
126
|
+
kEmbeddingsWeights0, kEmbeddingsWeights1, kEmbeddingsWeights2,
|
127
|
+
kEmbeddingsWeights3, kEmbeddingsWeights4, kEmbeddingsWeights5};
|
128
|
+
static const float16 kEmbeddingsQuantScales0[];
|
129
|
+
static const float16 kEmbeddingsQuantScales1[];
|
130
|
+
static const float16 kEmbeddingsQuantScales2[];
|
131
|
+
static const float16 kEmbeddingsQuantScales3[];
|
132
|
+
static const float16 kEmbeddingsQuantScales4[];
|
133
|
+
static const float16 kEmbeddingsQuantScales5[];
|
134
|
+
const float16 *embeddings_quant_scales_[6] = {
|
135
|
+
kEmbeddingsQuantScales0, kEmbeddingsQuantScales1,
|
136
|
+
kEmbeddingsQuantScales2, kEmbeddingsQuantScales3,
|
137
|
+
kEmbeddingsQuantScales4, kEmbeddingsQuantScales5};
|
138
|
+
|
139
|
+
// Private fields for hidden:
|
140
|
+
static const int kHiddenNumRows[];
|
141
|
+
static const int kHiddenNumCols[];
|
142
|
+
static const float kHiddenWeights0[];
|
143
|
+
const void *hidden_weights_[1] = {kHiddenWeights0};
|
144
|
+
|
145
|
+
// Private fields for hidden_bias:
|
146
|
+
static const int kHiddenBiasNumRows[];
|
147
|
+
static const int kHiddenBiasNumCols[];
|
148
|
+
static const float kHiddenBiasWeights0[];
|
149
|
+
const void *hidden_bias_weights_[1] = {kHiddenBiasWeights0};
|
150
|
+
|
151
|
+
// Private fields for softmax:
|
152
|
+
static const int kSoftmaxNumRows[];
|
153
|
+
static const int kSoftmaxNumCols[];
|
154
|
+
static const float kSoftmaxWeights0[];
|
155
|
+
const void *softmax_weights_[1] = {kSoftmaxWeights0};
|
156
|
+
|
157
|
+
// Private fields for softmax_bias:
|
158
|
+
static const int kSoftmaxBiasNumRows[];
|
159
|
+
static const int kSoftmaxBiasNumCols[];
|
160
|
+
static const float kSoftmaxBiasWeights0[];
|
161
|
+
const void *softmax_bias_weights_[1] = {kSoftmaxBiasWeights0};
|
162
|
+
|
163
|
+
// Private fields for embedding_dim:
|
164
|
+
static const int32 kEmbeddingDimValues[];
|
165
|
+
|
166
|
+
// Private fields for embedding_num_features:
|
167
|
+
static const int32 kEmbeddingNumFeaturesValues[];
|
168
|
+
|
169
|
+
// Private fields for embedding_features_domain_size:
|
170
|
+
static const int32 kEmbeddingFeaturesDomainSizeValues[];
|
171
|
+
|
172
|
+
// Private fields for concat_offset:
|
173
|
+
static const int32 kConcatOffsetValues[];
|
174
|
+
}; // class LangIdNNParams
|
175
|
+
|
176
|
+
} // namespace chrome_lang_id
|
177
|
+
|
178
|
+
#endif // LANG_ID_NN_PARAMS_H_
|
@@ -0,0 +1,165 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "language_identifier_features.h"
|
17
|
+
|
18
|
+
#include <sstream>
|
19
|
+
#include <unordered_map>
|
20
|
+
#include <utility>
|
21
|
+
#include <vector>
|
22
|
+
|
23
|
+
#include "base.h"
|
24
|
+
#include "feature_extractor.h"
|
25
|
+
#include "feature_types.h"
|
26
|
+
#include "script_span/generated_ulscript.h"
|
27
|
+
#include "script_span/getonescriptspan.h"
|
28
|
+
#include "sentence_features.h"
|
29
|
+
#include "task_context.h"
|
30
|
+
#include "unicodetext.h"
|
31
|
+
#include "utils.h"
|
32
|
+
|
33
|
+
namespace chrome_lang_id {
|
34
|
+
// Builds a numeric feature type called 'name'. 'size' is stored and later
// returned by GetDomainSize().
NumericFeatureType::NumericFeatureType(const string &name, FeatureValue size)
    : FeatureType(name),
      size_(size) {
}
|
36
|
+
|
37
|
+
// Returns the textual form of 'value' as produced by Int64ToString(), or an
// empty string when 'value' is negative.
string NumericFeatureType::GetFeatureValueName(FeatureValue value) const {
  if (value < 0) {
    return "";
  }
  return Int64ToString(value);
}
|
40
|
+
|
41
|
+
// Returns the size that was handed to the constructor.
FeatureValue NumericFeatureType::GetDomainSize() const {
  return size_;
}
|
42
|
+
|
43
|
+
// Reads the feature function descriptor parameters into the corresponding
// members. The second argument of each Get*Parameter() call is the value used
// when the descriptor does not provide one. 'context' is not used here.
void ContinuousBagOfNgramsFunction::Setup(TaskContext *context) {
  // Boolean switches.
  include_terminators_ = GetBoolParameter("include_terminators", false);
  include_spaces_ = GetBoolParameter("include_spaces", false);
  use_equal_ngram_weight_ = GetBoolParameter("use_equal_weight", false);

  // Integer settings: modulus applied to n-gram hashes, and n-gram length.
  ngram_id_dimension_ = GetIntParameter("id_dim", 10000);
  ngram_size_ = GetIntParameter("size", 3);
}
|
51
|
+
|
52
|
+
// Registers a NumericFeatureType named after this function whose domain size
// is ngram_id_dimension_. 'context' is not used here.
// NOTE(review): set_feature_type() presumably takes ownership of the
// allocated object -- confirm against its declaration.
void ContinuousBagOfNgramsFunction::Init(TaskContext *context) {
  NumericFeatureType *const ngram_feature_type =
      new NumericFeatureType(name(), ngram_id_dimension_);
  set_feature_type(ngram_feature_type);
}
|
55
|
+
|
56
|
+
void ContinuousBagOfNgramsFunction::Evaluate(const WorkspaceSet &workspaces,
|
57
|
+
const Sentence &sentence,
|
58
|
+
FeatureVector *result) const {
|
59
|
+
// Include terminators for each token. Tokens are discovered by splitting the
|
60
|
+
// text on spaces.
|
61
|
+
std::vector<string> chars;
|
62
|
+
utils::GetUTF8Chars(sentence.text(), &chars);
|
63
|
+
if (include_terminators_) {
|
64
|
+
std::vector<string> new_chars{"^"};
|
65
|
+
for (size_t index = 0; index < chars.size(); ++index) {
|
66
|
+
if (chars.at(index) == " ") {
|
67
|
+
new_chars.push_back("$");
|
68
|
+
new_chars.push_back(" ");
|
69
|
+
new_chars.push_back("^");
|
70
|
+
} else {
|
71
|
+
new_chars.push_back(chars.at(index));
|
72
|
+
}
|
73
|
+
}
|
74
|
+
new_chars.push_back("$");
|
75
|
+
chars.swap(new_chars);
|
76
|
+
}
|
77
|
+
|
78
|
+
// Find the char ngram counts.
|
79
|
+
std::unordered_map<string, int> char_ngram_counts;
|
80
|
+
int count_sum = 0;
|
81
|
+
for (int start = 0; start <= static_cast<int>(chars.size()) - ngram_size_;
|
82
|
+
++start) {
|
83
|
+
string char_ngram;
|
84
|
+
int index;
|
85
|
+
for (index = 0; index < ngram_size_; ++index) {
|
86
|
+
const string ¤t_char = chars.at(start + index);
|
87
|
+
if (current_char == " " && !include_spaces_) {
|
88
|
+
break;
|
89
|
+
}
|
90
|
+
char_ngram.append(current_char);
|
91
|
+
}
|
92
|
+
if (index == ngram_size_) {
|
93
|
+
char_ngram_counts[char_ngram]++;
|
94
|
+
++count_sum;
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
// Populate the feature vector.
|
99
|
+
const float equal_weight = 1.0 / char_ngram_counts.size();
|
100
|
+
const float norm = static_cast<float>(count_sum);
|
101
|
+
for (const auto &ngram_and_count : char_ngram_counts) {
|
102
|
+
const float weight =
|
103
|
+
use_equal_ngram_weight_ ? equal_weight : ngram_and_count.second / norm;
|
104
|
+
FloatFeatureValue value(
|
105
|
+
utils::Hash32WithDefaultSeed(ngram_and_count.first) %
|
106
|
+
ngram_id_dimension_,
|
107
|
+
weight);
|
108
|
+
result->add(feature_type(), value.discrete_value);
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
// Returns the script of sentence.text() as reported by a single call to
// CLD2::ScriptScanner::GetOneScriptSpan(). When that script is ULScript_Hani,
// the span is inspected further: if Hangul codepoints outnumber the other
// non-space codepoints, NUM_ULSCRIPTS is returned instead of ULScript_Hani.
// 'workspaces' and 'result' are not used.
FeatureValue ScriptFeature::Compute(const WorkspaceSet &workspaces,
                                    const Sentence &sentence,
                                    const FeatureVector *result) const {
  const string &text = sentence.text();
  CLD2::ScriptScanner ss(text.c_str(), text.size(), /*is_plain_text=*/true);

  // The input is assumed to contain a single script, so one call to
  // GetOneScriptSpan() suffices. That call also cleans up the input (e.g.,
  // removes digits, punctuation).
  // TODO(abakalov): Extract the clean-up and script detection code out of
  // GetOneScriptSpan() because we don't have to iterate over the whole text,
  // just look at the first codepoint after clean-up.
  CLD2::LangSpan script_span;
  ss.GetOneScriptSpan(&script_span);
  const CLD2::ULScript ulscript = script_span.ulscript;

  // Every script other than Hani is reported as is.
  if (ulscript != CLD2::ULScript_Hani) {
    return ulscript;
  }

  // True when 'cp' lies in one of the Hangul ranges.
  const auto is_hangul = [](chrome_lang_id::char32 cp) {
    return (cp >= 0x1100 && cp <= 0x11FF) ||  // Hangul Jamo
           (cp >= 0xA960 && cp <= 0xA97F) ||  // Jamo Extended A
           (cp >= 0xD7B0 && cp <= 0xD7FF) ||  // Jamo Extended B
           (cp >= 0x3130 && cp <= 0x318F) ||  // Compatibility Jamo
           (cp >= 0xFFA0 && cp <= 0xFFDC) ||  // Halfwidth Jamo
           (cp >= 0xAC00 && cp <= 0xD7AF);    // Hangul Syllables
  };

  // Among the codepoints of the Hani span, tally Hangul (Korean script)
  // separately from everything else. Spaces belong to neither tally.
  int hangul_total = 0;
  int other_total = 0;
  UnicodeText unicode_text;
  unicode_text.PointToUTF8(script_span.text, script_span.text_bytes);
  for (chrome_lang_id::char32 codepoint : unicode_text) {
    if (codepoint == 0x20) {
      continue;
    }
    if (is_hangul(codepoint)) {
      ++hangul_total;
    } else {
      ++other_total;
    }
  }

  // A strict Hangul majority is reported as NUM_ULSCRIPTS; ties and
  // everything else stay ULScript_Hani.
  if (hangul_total > other_total) {
    return static_cast<FeatureValue>(CLD2::NUM_ULSCRIPTS);
  }
  return static_cast<FeatureValue>(CLD2::ULScript_Hani);
}
|
164
|
+
|
165
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,116 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
|
17
|
+
#define LANGUAGE_IDENTIFIER_FEATURES_H_
|
18
|
+
|
19
|
+
#include <string>
|
20
|
+
|
21
|
+
#include "feature_extractor.h"
|
22
|
+
#include "feature_types.h"
|
23
|
+
#include "script_span/generated_ulscript.h"
|
24
|
+
#include "cld_3/protos/sentence.pb.h"
|
25
|
+
#include "sentence_features.h"
|
26
|
+
#include "task_context.h"
|
27
|
+
#include "workspace.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
|
31
|
+
// Feature type for numeric features.
|
32
|
+
class NumericFeatureType : public FeatureType {
|
33
|
+
public:
|
34
|
+
// Initializes numeric feature.
|
35
|
+
NumericFeatureType(const string &name, FeatureValue size);
|
36
|
+
|
37
|
+
// Returns numeric feature value.
|
38
|
+
string GetFeatureValueName(FeatureValue value) const override;
|
39
|
+
|
40
|
+
// Returns the number of feature values.
|
41
|
+
FeatureValue GetDomainSize() const override;
|
42
|
+
|
43
|
+
private:
|
44
|
+
FeatureValue size_;
|
45
|
+
};
|
46
|
+
|
47
|
+
// Class for computing continuous char ngram features.
|
48
|
+
// Feature function descriptor parameters:
|
49
|
+
// include_terminators(bool, false):
|
50
|
+
// If 'true', then splits the text based on spaces to get tokens, adds "^"
|
51
|
+
// to the beginning of each token, and adds "$" to the end of each token.
|
52
|
+
// include_spaces(bool, false):
|
53
|
+
// If 'true', then includes char ngrams containing spaces.
|
54
|
+
// use_equal_weight(bool, false):
|
55
|
+
// If 'true', then weighs each unique ngram by 1.0 / (number of unique
|
56
|
+
// ngrams in the input). Otherwise, weighs each unique ngram by (ngram
|
57
|
+
// count) / (total number of ngrams).
|
58
|
+
// id_dim(int, 10000):
|
59
|
+
// The integer id of each char ngram is computed as follows:
|
60
|
+
// Hash32WithDefaultSeed(char ngram) % id_dim.
|
61
|
+
// size(int, 3):
|
62
|
+
// Only ngrams of this size will be extracted.
|
63
|
+
class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
|
64
|
+
public:
|
65
|
+
void Setup(TaskContext *context) override;
|
66
|
+
void Init(TaskContext *context) override;
|
67
|
+
|
68
|
+
// Appends the features computed from the focus to the feature vector.
|
69
|
+
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
|
70
|
+
FeatureVector *result) const override;
|
71
|
+
|
72
|
+
private:
|
73
|
+
// If 'true', then splits the text based on spaces to get tokens, adds "^" to
|
74
|
+
// the beginning of each token, and adds "$" to the end of each token.
|
75
|
+
bool include_terminators_;
|
76
|
+
|
77
|
+
// If 'true', then includes char ngrams containing spaces.
|
78
|
+
bool include_spaces_;
|
79
|
+
|
80
|
+
// If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
|
81
|
+
// in the input). Otherwise, weighs each unique ngram by (ngram count) /
|
82
|
+
// (total number of ngrams).
|
83
|
+
bool use_equal_ngram_weight_;
|
84
|
+
|
85
|
+
// The integer id of each char ngram is computed as follows:
|
86
|
+
// Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
|
87
|
+
int ngram_id_dimension_;
|
88
|
+
|
89
|
+
// Only ngrams of size ngram_size_ will be extracted.
|
90
|
+
int ngram_size_;
|
91
|
+
};
|
92
|
+
|
93
|
+
// Class for detecting the script of a piece of text. The list of supported
|
94
|
+
// scripts is chrome_lang_id::CLD2::ULScript. This class uses the script
|
95
|
+
// recognition code ported from CLD2. ULScript_Hani is split into non-Korean
|
96
|
+
// script and Korean script (Hangul). In the former case, the function emits
|
97
|
+
// ULScript_Hani. In the latter case, the function emits NUM_ULSCRIPTS. The
|
98
|
+
// class assumes that the input is (1) interchange valid UTF8, and (2) contains
|
99
|
+
// only one chrome_lang_id::CLD2::ULScript.
|
100
|
+
class ScriptFeature : public WholeSentenceFeature {
|
101
|
+
public:
|
102
|
+
void Init(TaskContext *context) override {
|
103
|
+
// The dimension is incremented by 1 because ULScript_Hani is split into two
|
104
|
+
// as mentioned in the class description.
|
105
|
+
set_feature_type(new NumericFeatureType(
|
106
|
+
name(), chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1));
|
107
|
+
}
|
108
|
+
|
109
|
+
// Computes the feature and saves it in the feature vector.
|
110
|
+
FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
|
111
|
+
const FeatureVector *result) const override;
|
112
|
+
};
|
113
|
+
|
114
|
+
} // namespace chrome_lang_id
|
115
|
+
|
116
|
+
#endif // LANGUAGE_IDENTIFIER_FEATURES_H_
|