cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,178 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef LANG_ID_NN_PARAMS_H_
17
+ #define LANG_ID_NN_PARAMS_H_
18
+
19
+ #include "base.h"
20
+ #include "embedding_network_params.h"
21
+ #include "float16.h"
22
+
23
+ namespace chrome_lang_id {
24
+
25
+ class LangIdNNParams : public EmbeddingNetworkParams {  // EmbeddingNetworkParams implementation whose accessors return literals or entries of the arrays declared in the private section.
26
+ public:
27
+ ~LangIdNNParams() override {}
28
+
29
+ // Access methods for embeddings. Here and in the other indexed accessors of this class, i is used as a raw array index with no bounds check.
30
+ int embeddings_size() const override { return 6; }  // Same count as the entries of embeddings_weights_ and embeddings_quant_scales_.
31
+ int embeddings_num_rows(int i) const override {
32
+ return kEmbeddingsNumRows[i];
33
+ }
34
+ int embeddings_num_cols(int i) const override {
35
+ return kEmbeddingsNumCols[i];
36
+ }
37
+ const void *embeddings_weights(int i) const override {
38
+ return embeddings_weights_[i];
39
+ }
40
+ QuantizationType embeddings_quant_type(int i) const override {
41
+ return QuantizationType::UINT8;  // Same answer for every i; the index is ignored.
42
+ }
43
+ const float16 *embeddings_quant_scales(int i) const override {
44
+ return embeddings_quant_scales_[i];
45
+ }
46
+
47
+ // Access methods for hidden (hidden_size() is 1 and hidden_weights_ has one entry):
48
+ int hidden_size() const override { return 1; }
49
+ int hidden_num_rows(int i) const override { return kHiddenNumRows[i]; }
50
+ int hidden_num_cols(int i) const override { return kHiddenNumCols[i]; }
51
+ const void *hidden_weights(int i) const override {
52
+ return hidden_weights_[i];
53
+ }
54
+
55
+ // Access methods for hidden_bias (hidden_bias_size() is 1 and hidden_bias_weights_ has one entry):
56
+ int hidden_bias_size() const override { return 1; }
57
+ int hidden_bias_num_rows(int i) const override {
58
+ return kHiddenBiasNumRows[i];
59
+ }
60
+ int hidden_bias_num_cols(int i) const override {
61
+ return kHiddenBiasNumCols[i];
62
+ }
63
+ const void *hidden_bias_weights(int i) const override {
64
+ return hidden_bias_weights_[i];
65
+ }
66
+
67
+ // Access methods for softmax (softmax_size() is 1 and softmax_weights_ has one entry):
68
+ int softmax_size() const override { return 1; }
69
+ int softmax_num_rows(int i) const override { return kSoftmaxNumRows[i]; }
70
+ int softmax_num_cols(int i) const override { return kSoftmaxNumCols[i]; }
71
+ const void *softmax_weights(int i) const override {
72
+ return softmax_weights_[i];
73
+ }
74
+
75
+ // Access methods for softmax_bias (softmax_bias_size() is 1 and softmax_bias_weights_ has one entry):
76
+ int softmax_bias_size() const override { return 1; }
77
+ int softmax_bias_num_rows(int i) const override {
78
+ return kSoftmaxBiasNumRows[i];
79
+ }
80
+ int softmax_bias_num_cols(int i) const override {
81
+ return kSoftmaxBiasNumCols[i];
82
+ }
83
+ const void *softmax_bias_weights(int i) const override {
84
+ return softmax_bias_weights_[i];
85
+ }
86
+
87
+ // Access methods for embedding_dim (the size is the literal 6; the backing array is declared below without a visible length):
88
+ int embedding_dim_size() const override { return 6; }
89
+ int32 embedding_dim(int i) const override { return kEmbeddingDimValues[i]; }
90
+
91
+ // Access methods for embedding_num_features (size is the literal 6):
92
+ int embedding_num_features_size() const override { return 6; }
93
+ int32 embedding_num_features(int i) const override {
94
+ return kEmbeddingNumFeaturesValues[i];
95
+ }
96
+
97
+ // Access methods for embedding_features_domain_size (size is the literal 6):
98
+ int embedding_features_domain_size_size() const override { return 6; }
99
+ int32 embedding_features_domain_size(int i) const override {
100
+ return kEmbeddingFeaturesDomainSizeValues[i];
101
+ }
102
+
103
+ // Access methods for concat_offset (size is the literal 6):
104
+ int concat_offset_size() const override { return 6; }
105
+ int32 concat_offset(int i) const override { return kConcatOffsetValues[i]; }
106
+
107
+ // Access methods for concat_layer_size: the field is reported as present, with the literal value 80.
108
+ bool has_concat_layer_size() const override { return true; }
109
+ int32 concat_layer_size() const override { return 80; }
110
+
111
+ // Access methods for is_precomputed: the field is reported as absent, and the value getter returns false.
112
+ bool has_is_precomputed() const override { return false; }
113
+ bool is_precomputed() const override { return false; }
114
+
115
+ private:
116
+ // Private fields for embeddings. The static arrays in this section are only declared here, without a length; their definitions are outside this header.
117
+ static const int kEmbeddingsNumRows[];
118
+ static const int kEmbeddingsNumCols[];
119
+ static const uint8 kEmbeddingsWeights0[];
120
+ static const uint8 kEmbeddingsWeights1[];
121
+ static const uint8 kEmbeddingsWeights2[];
122
+ static const uint8 kEmbeddingsWeights3[];
123
+ static const uint8 kEmbeddingsWeights4[];
124
+ static const uint8 kEmbeddingsWeights5[];
125
+ const void *embeddings_weights_[6] = {  // Non-static member: every instance carries its own table of pointers to the static arrays.
126
+ kEmbeddingsWeights0, kEmbeddingsWeights1, kEmbeddingsWeights2,
127
+ kEmbeddingsWeights3, kEmbeddingsWeights4, kEmbeddingsWeights5};
128
+ static const float16 kEmbeddingsQuantScales0[];
129
+ static const float16 kEmbeddingsQuantScales1[];
130
+ static const float16 kEmbeddingsQuantScales2[];
131
+ static const float16 kEmbeddingsQuantScales3[];
132
+ static const float16 kEmbeddingsQuantScales4[];
133
+ static const float16 kEmbeddingsQuantScales5[];
134
+ const float16 *embeddings_quant_scales_[6] = {  // Non-static member, one scale array per embedding weight array.
135
+ kEmbeddingsQuantScales0, kEmbeddingsQuantScales1,
136
+ kEmbeddingsQuantScales2, kEmbeddingsQuantScales3,
137
+ kEmbeddingsQuantScales4, kEmbeddingsQuantScales5};
138
+
139
+ // Private fields for hidden (weights are declared as float, unlike the uint8 embedding weights):
140
+ static const int kHiddenNumRows[];
141
+ static const int kHiddenNumCols[];
142
+ static const float kHiddenWeights0[];
143
+ const void *hidden_weights_[1] = {kHiddenWeights0};
144
+
145
+ // Private fields for hidden_bias:
146
+ static const int kHiddenBiasNumRows[];
147
+ static const int kHiddenBiasNumCols[];
148
+ static const float kHiddenBiasWeights0[];
149
+ const void *hidden_bias_weights_[1] = {kHiddenBiasWeights0};
150
+
151
+ // Private fields for softmax:
152
+ static const int kSoftmaxNumRows[];
153
+ static const int kSoftmaxNumCols[];
154
+ static const float kSoftmaxWeights0[];
155
+ const void *softmax_weights_[1] = {kSoftmaxWeights0};
156
+
157
+ // Private fields for softmax_bias:
158
+ static const int kSoftmaxBiasNumRows[];
159
+ static const int kSoftmaxBiasNumCols[];
160
+ static const float kSoftmaxBiasWeights0[];
161
+ const void *softmax_bias_weights_[1] = {kSoftmaxBiasWeights0};
162
+
163
+ // Private fields for embedding_dim (returned by embedding_dim(i)):
164
+ static const int32 kEmbeddingDimValues[];
165
+
166
+ // Private fields for embedding_num_features (returned by embedding_num_features(i)):
167
+ static const int32 kEmbeddingNumFeaturesValues[];
168
+
169
+ // Private fields for embedding_features_domain_size (returned by embedding_features_domain_size(i)):
170
+ static const int32 kEmbeddingFeaturesDomainSizeValues[];
171
+
172
+ // Private fields for concat_offset (returned by concat_offset(i)):
173
+ static const int32 kConcatOffsetValues[];
174
+ }; // class LangIdNNParams
175
+
176
+ } // namespace chrome_lang_id
177
+
178
+ #endif // LANG_ID_NN_PARAMS_H_
@@ -0,0 +1,165 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "language_identifier_features.h"
17
+
18
+ #include <sstream>
19
+ #include <unordered_map>
20
+ #include <utility>
21
+ #include <vector>
22
+
23
+ #include "base.h"
24
+ #include "feature_extractor.h"
25
+ #include "feature_types.h"
26
+ #include "script_span/generated_ulscript.h"
27
+ #include "script_span/getonescriptspan.h"
28
+ #include "sentence_features.h"
29
+ #include "task_context.h"
30
+ #include "unicodetext.h"
31
+ #include "utils.h"
32
+
33
+ namespace chrome_lang_id {
34
+ NumericFeatureType::NumericFeatureType(const string &name, FeatureValue size)
35
+ : FeatureType(name), size_(size) {}  // Forwards name to the FeatureType base and stores size, which GetDomainSize() later returns.
36
+
37
+ string NumericFeatureType::GetFeatureValueName(FeatureValue value) const {
38
+ return value < 0 ? "" : Int64ToString(value);  // Negative values give the empty string; all others are formatted by Int64ToString.
39
+ }
40
+
41
+ FeatureValue NumericFeatureType::GetDomainSize() const { return size_; }  // Returns the size stored by the constructor.
42
+
43
+ void ContinuousBagOfNgramsFunction::Setup(TaskContext *context) {  // context is not used in this body.
44
+ // Parameters in the feature function descriptor. The second argument of each call is presumably the value used when the parameter is absent — TODO confirm against Get*Parameter.
45
+ include_terminators_ = GetBoolParameter("include_terminators", false);
46
+ include_spaces_ = GetBoolParameter("include_spaces", false);
47
+ use_equal_ngram_weight_ = GetBoolParameter("use_equal_weight", false);  // Descriptor key is "use_equal_weight"; the member name differs.
48
+ ngram_id_dimension_ = GetIntParameter("id_dim", 10000);  // Used by Evaluate() as the modulus for ngram hashes and by Init() as the domain size.
49
+ ngram_size_ = GetIntParameter("size", 3);
50
+ }
51
+
52
+ void ContinuousBagOfNgramsFunction::Init(TaskContext *context) {  // context is not used in this body.
53
+ set_feature_type(new NumericFeatureType(name(), ngram_id_dimension_));  // Domain size is ngram_id_dimension_; presumably set_feature_type takes ownership of the new object — TODO confirm.
54
+ }
55
+
56
+ void ContinuousBagOfNgramsFunction::Evaluate(const WorkspaceSet &workspaces,  // workspaces is not used in this body.
57
+ const Sentence &sentence,
58
+ FeatureVector *result) const {
59
+ // Split the sentence text into characters. When include_terminators_ is set,
60
+ // wrap every space-separated token in "^" (start) and "$" (end) markers.
61
+ std::vector<string> chars;  // Filled by utils::GetUTF8Chars; presumably one string per UTF-8 character of the text — TODO confirm.
62
+ utils::GetUTF8Chars(sentence.text(), &chars);
63
+ if (include_terminators_) {
64
+ std::vector<string> new_chars{"^"};  // "^" opens the first token.
65
+ for (size_t index = 0; index < chars.size(); ++index) {
66
+ if (chars.at(index) == " ") {  // A space closes the current token with "$", is kept itself, and opens the next token with "^".
67
+ new_chars.push_back("$");
68
+ new_chars.push_back(" ");
69
+ new_chars.push_back("^");
70
+ } else {
71
+ new_chars.push_back(chars.at(index));
72
+ }
73
+ }
74
+ new_chars.push_back("$");  // "$" closes the last token.
75
+ chars.swap(new_chars);
76
+ }
77
+
78
+ // Find the char ngram counts: every window of ngram_size_ consecutive entries of chars is a candidate ngram.
79
+ std::unordered_map<string, int> char_ngram_counts;
80
+ int count_sum = 0;  // Number of counted windows, i.e. the sum of all map values.
81
+ for (int start = 0; start <= static_cast<int>(chars.size()) - ngram_size_;  // Signed arithmetic: the bound is negative (no iterations) when chars has fewer than ngram_size_ entries.
82
+ ++start) {
83
+ string char_ngram;
84
+ int index;  // Declared outside the inner loop so the completeness test below can read it.
85
+ for (index = 0; index < ngram_size_; ++index) {
86
+ const string &current_char = chars.at(start + index);
87
+ if (current_char == " " && !include_spaces_) {
88
+ break;  // Leaves index < ngram_size_, so this window is not counted.
89
+ }
90
+ char_ngram.append(current_char);
91
+ }
92
+ if (index == ngram_size_) {  // Only windows that were built in full are counted.
93
+ char_ngram_counts[char_ngram]++;
94
+ ++count_sum;
95
+ }
96
+ }
97
+
98
+ // Populate the feature vector with one entry per distinct ngram.
99
+ const float equal_weight = 1.0 / char_ngram_counts.size();  // The divisor is 0 when no ngram was counted, but then the loop below has no iterations and this value is never read.
100
+ const float norm = static_cast<float>(count_sum);
101
+ for (const auto &ngram_and_count : char_ngram_counts) {
102
+ const float weight =
103
+ use_equal_ngram_weight_ ? equal_weight : ngram_and_count.second / norm;  // Otherwise: this ngram's share of all counted ngrams.
104
+ FloatFeatureValue value(
105
+ utils::Hash32WithDefaultSeed(ngram_and_count.first) %
106
+ ngram_id_dimension_,  // Feature id: the ngram hash reduced modulo ngram_id_dimension_.
107
+ weight);
108
+ result->add(feature_type(), value.discrete_value);  // presumably discrete_value carries both the id and the weight — TODO confirm against FloatFeatureValue.
109
+ }
110
+ }
111
+
112
+ FeatureValue ScriptFeature::Compute(const WorkspaceSet &workspaces,  // workspaces and result are not used in this body; the feature is delivered through the return value.
113
+ const Sentence &sentence,
114
+ const FeatureVector *result) const {
115
+ const string &text = sentence.text();
116
+ CLD2::ScriptScanner ss(text.c_str(), text.size(),
117
+ /*is_plain_text=*/true);
118
+
119
+ // GetOneScriptSpan() is called only once because of the assumption that the
120
+ // input contains one script. This function also cleans up the input (e.g.,
121
+ // removes digits, punctuation).
122
+ // TODO(abakalov): Extract the clean-up and script detection code out of
123
+ // GetOneScriptSpan() because we don't have to iterate over the whole text,
124
+ // just look at the first codepoint after clean-up.
125
+ CLD2::LangSpan script_span;
126
+ ss.GetOneScriptSpan(&script_span);  // The return value of GetOneScriptSpan() is not checked.
127
+ const CLD2::ULScript ulscript = script_span.ulscript;
128
+ if (ulscript != CLD2::ULScript_Hani) {
129
+ return ulscript;  // Any script other than Hani is returned exactly as the scanner reported it.
130
+ } else {
131
+ // Out of the codepoints captured by ULScript_Hani, separately count those
132
+ // in Hangul (Korean script) and those in a script other than Hangul.
133
+ int num_hangul = 0;
134
+ int num_non_hangul = 0;
135
+ UnicodeText unicode_text;
136
+ unicode_text.PointToUTF8(script_span.text, script_span.text_bytes);  // Iterates over the span produced by the scanner, not over the original sentence text.
137
+ for (chrome_lang_id::char32 codepoint : unicode_text) {
138
+ // If the current codepoint is space, continue (spaces count toward neither total).
139
+ if (codepoint == 0x20) {
140
+ continue;
141
+ }
142
+
143
+ // Check if the current codepoint is within the ranges associated with
144
+ // Hangul.
145
+ if ((codepoint >= 0x1100 && codepoint <= 0x11FF) || // Hangul Jamo
146
+ (codepoint >= 0xA960 && codepoint <= 0xA97F) || // Jamo Extended A
147
+ (codepoint >= 0xD7B0 && codepoint <= 0xD7FF) || // Jamo Extended B
148
+ (codepoint >= 0x3130 && codepoint <= 0x318F) || // Compatibility Jamo
149
+ (codepoint >= 0xFFA0 && codepoint <= 0xFFDC) || // Halfwidth Jamo
150
+ (codepoint >= 0xAC00 && codepoint <= 0xD7AF)) { // Hangul Syllables
151
+ num_hangul++;
152
+ } else {
153
+ num_non_hangul++;
154
+ }
155
+ }
156
+
157
+ if (num_hangul > num_non_hangul) {  // Strict comparison: a tie (including an empty span) is reported as Hani.
158
+ return static_cast<FeatureValue>(CLD2::NUM_ULSCRIPTS);  // NUM_ULSCRIPTS serves as the feature value for Hangul.
159
+ } else {
160
+ return static_cast<FeatureValue>(CLD2::ULScript_Hani);
161
+ }
162
+ }
163
+ }
164
+
165
+ } // namespace chrome_lang_id
@@ -0,0 +1,116 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
17
+ #define LANGUAGE_IDENTIFIER_FEATURES_H_
18
+
19
+ #include <string>
20
+
21
+ #include "feature_extractor.h"
22
+ #include "feature_types.h"
23
+ #include "script_span/generated_ulscript.h"
24
+ #include "cld_3/protos/sentence.pb.h"
25
+ #include "sentence_features.h"
26
+ #include "task_context.h"
27
+ #include "workspace.h"
28
+
29
+ namespace chrome_lang_id {
30
+
31
+ // Feature type for numeric features.
32
+ class NumericFeatureType : public FeatureType {
33
+ public:
34
+ // Initializes a numeric feature type called 'name' whose domain size is 'size'.
35
+ NumericFeatureType(const string &name, FeatureValue size);
36
+
37
+ // Returns 'value' formatted via Int64ToString, or the empty string when 'value' is negative.
38
+ string GetFeatureValueName(FeatureValue value) const override;
39
+
40
+ // Returns the number of feature values, i.e. the 'size' given to the constructor.
41
+ FeatureValue GetDomainSize() const override;
42
+
43
+ private:
44
+ FeatureValue size_;  // Domain size; assigned in the constructor's initializer list.
45
+ };
46
+
47
+ // Class for computing continuous char ngram features.
48
+ // Feature function descriptor parameters:
49
+ // include_terminators(bool, false):
50
+ // If 'true', then splits the text based on spaces to get tokens, adds "^"
51
+ // to the beginning of each token, and adds "$" to the end of each token.
52
+ // include_spaces(bool, false):
53
+ // If 'true', then includes char ngrams containing spaces.
54
+ // use_equal_weight(bool, false):
55
+ // If 'true', then weighs each unique ngram by 1.0 / (number of unique
56
+ // ngrams in the input). Otherwise, weighs each unique ngram by (ngram
57
+ // count) / (total number of ngrams).
58
+ // id_dim(int, 10000):
59
+ // The integer id of each char ngram is computed as follows:
60
+ // Hash32WithDefaultSeed(char ngram) % id_dim.
61
+ // size(int, 3):
62
+ // Only ngrams of this size will be extracted.
63
+ class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
64
+ public:
65
+ void Setup(TaskContext *context) override;  // Reads the descriptor parameters into the private members below.
66
+ void Init(TaskContext *context) override;  // Installs a NumericFeatureType whose domain size is ngram_id_dimension_.
67
+
68
+ // Appends one weighted feature per distinct char ngram of the sentence text to 'result'.
69
+ void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
70
+ FeatureVector *result) const override;
71
+
72
+ private:
73
+ // If 'true', then splits the text based on spaces to get tokens, adds "^" to
74
+ // the beginning of each token, and adds "$" to the end of each token.
75
+ bool include_terminators_;  // No in-class initializer; assigned in Setup() from "include_terminators".
76
+
77
+ // If 'true', then includes char ngrams containing spaces.
78
+ bool include_spaces_;  // No in-class initializer; assigned in Setup() from "include_spaces".
79
+
80
+ // If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
81
+ // in the input). Otherwise, weighs each unique ngram by (ngram count) /
82
+ // (total number of ngrams).
83
+ bool use_equal_ngram_weight_;  // No in-class initializer; assigned in Setup() from "use_equal_weight".
84
+
85
+ // The integer id of each char ngram is computed as follows:
86
+ // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
87
+ int ngram_id_dimension_;  // No in-class initializer; assigned in Setup() from "id_dim".
88
+
89
+ // Only ngrams of size ngram_size_ will be extracted.
90
+ int ngram_size_;  // No in-class initializer; assigned in Setup() from "size".
91
+ };
92
+
93
+ // Class for detecting the script of a piece of text. The list of supported
94
+ // scripts is chrome_lang_id::CLD2::ULScript. This class uses the script
95
+ // recognition code ported from CLD2. ULScript_Hani is split into non-Korean
96
+ // script and Korean script (Hangul). In the former case, the function emits
97
+ // ULScript_Hani. In the latter case, the function emits NUM_ULSCRIPTS. The
98
+ // class assumes that the input is (1) interchange valid UTF8, and (2) contains
99
+ // only one chrome_lang_id::CLD2::ULScript.
100
+ class ScriptFeature : public WholeSentenceFeature {
101
+ public:
102
+ void Init(TaskContext *context) override {  // context is not used in this body.
103
+ // The dimension is incremented by 1 because ULScript_Hani is split into two
104
+ // as mentioned in the class description.
105
+ set_feature_type(new NumericFeatureType(
106
+ name(), chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1));  // The extra value, NUM_ULSCRIPTS itself, is what Compute() returns for Hangul.
107
+ }
108
+
109
+ // Computes the script feature for the sentence text and returns it; the implementation does not write to 'result'.
110
+ FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
111
+ const FeatureVector *result) const override;
112
+ };
113
+
114
+ } // namespace chrome_lang_id
115
+
116
+ #endif // LANGUAGE_IDENTIFIER_FEATURES_H_