cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "relevant_script_feature.h"
|
17
|
+
|
18
|
+
#include <ctype.h>
|
19
|
+
|
20
|
+
#include <string>
|
21
|
+
|
22
|
+
#include "feature_extractor.h"
|
23
|
+
#include "feature_types.h"
|
24
|
+
#include "language_identifier_features.h"
|
25
|
+
#include "script_detector.h"
|
26
|
+
#include "cld_3/protos/sentence.pb.h"
|
27
|
+
#include "sentence_features.h"
|
28
|
+
#include "task_context.h"
|
29
|
+
#include "utils.h"
|
30
|
+
#include "workspace.h"
|
31
|
+
|
32
|
+
namespace chrome_lang_id {
|
33
|
+
void RelevantScriptFeature::Setup(TaskContext *context) {
|
34
|
+
// Nothing.
|
35
|
+
}
|
36
|
+
|
37
|
+
// Registers the feature type for this feature: a NumericFeatureType named
// after the feature and sized by kNumRelevantScripts.  The task context is
// not consulted.
void RelevantScriptFeature::Init(TaskContext *context) {
  auto *const script_feature_type =
      new NumericFeatureType(name(), kNumRelevantScripts);
  set_feature_type(script_feature_type);
}
|
40
|
+
|
41
|
+
void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
|
42
|
+
const Sentence &sentence,
|
43
|
+
FeatureVector *result) const {
|
44
|
+
const string &text = sentence.text();
|
45
|
+
|
46
|
+
// We expect kNumRelevantScripts to be small, so we stack-allocate the array
|
47
|
+
// of counts. Still, if that changes, we want to find out.
|
48
|
+
static_assert(
|
49
|
+
kNumRelevantScripts < 25,
|
50
|
+
"switch counts to vector<int>: too big for stack-allocated int[]");
|
51
|
+
|
52
|
+
// counts[s] is the number of characters with script s.
|
53
|
+
// Note: {} "value-initializes" the array to zero.
|
54
|
+
int counts[kNumRelevantScripts]{};
|
55
|
+
int total_count = 0;
|
56
|
+
const char *const text_end = text.data() + text.size();
|
57
|
+
for (const char *curr = text.data(); curr < text_end;
|
58
|
+
curr += utils::OneCharLen(curr)) {
|
59
|
+
const int num_bytes = utils::OneCharLen(curr);
|
60
|
+
|
61
|
+
// If a partial UTF-8 character is encountered, break out of the loop.
|
62
|
+
if (curr + num_bytes > text_end) {
|
63
|
+
break;
|
64
|
+
}
|
65
|
+
|
66
|
+
// Skip spaces, numbers, punctuation, and all other non-alpha ASCII
|
67
|
+
// characters: these characters are used in so many languages, they do not
|
68
|
+
// communicate language-related information.
|
69
|
+
if ((num_bytes == 1) && !isalpha(*curr)) {
|
70
|
+
continue;
|
71
|
+
}
|
72
|
+
Script script = GetScript(curr, num_bytes);
|
73
|
+
CLD3_DCHECK(script >= 0);
|
74
|
+
CLD3_DCHECK(script < kNumRelevantScripts);
|
75
|
+
counts[static_cast<int>(script)]++;
|
76
|
+
total_count++;
|
77
|
+
}
|
78
|
+
|
79
|
+
for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
|
80
|
+
int count = counts[script_id];
|
81
|
+
if (count > 0) {
|
82
|
+
const float weight = static_cast<float>(count) / total_count;
|
83
|
+
FloatFeatureValue value(script_id, weight);
|
84
|
+
result->add(feature_type(), value.discrete_value);
|
85
|
+
}
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,49 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef RELEVANT_SCRIPT_FEATURE_H_
|
17
|
+
#define RELEVANT_SCRIPT_FEATURE_H_
|
18
|
+
|
19
|
+
#include "feature_extractor.h"
|
20
|
+
#include "cld_3/protos/sentence.pb.h"
|
21
|
+
#include "sentence_features.h"
|
22
|
+
#include "task_context.h"
|
23
|
+
#include "workspace.h"
|
24
|
+
|
25
|
+
namespace chrome_lang_id {
|
26
|
+
|
27
|
+
// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
|
28
|
+
// script (see below): each such feature indicates the script and the ratio of
|
29
|
+
// UTF8 characters in that script, in the given sentence.
|
30
|
+
//
|
31
|
+
// What is a relevant script? Recognizing all 100+ Unicode scripts would
|
32
|
+
// require too much code size and runtime. Instead, we focus only on a few
|
33
|
+
// scripts that communicate a lot of language information: e.g., the use of
|
34
|
+
// Hiragana characters almost always indicates Japanese, so Hiragana is a
|
35
|
+
// "relevant" script for us. The Latin script is used by dozens of languages, so
|
36
|
+
// Latin is not relevant in this context.
|
37
|
+
class RelevantScriptFeature : public WholeSentenceFeature {
|
38
|
+
public:
|
39
|
+
void Setup(TaskContext *context) override;
|
40
|
+
void Init(TaskContext *context) override;
|
41
|
+
|
42
|
+
// Appends the features computed from the sentence to the feature vector.
|
43
|
+
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
|
44
|
+
FeatureVector *result) const override;
|
45
|
+
};
|
46
|
+
|
47
|
+
} // namespace chrome_lang_id
|
48
|
+
|
49
|
+
#endif // RELEVANT_SCRIPT_FEATURE_H_
|
@@ -0,0 +1,156 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef SCRIPT_DETECTOR_H_
|
17
|
+
#define SCRIPT_DETECTOR_H_
|
18
|
+
|
19
|
+
namespace chrome_lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we
  // can easily iterate over the scripts.
  kNumRelevantScripts,
};

// Returns true iff value lies in the closed interval [low, hi].
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (low <= value) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
//
// One- and four-byte characters are not inspected at all; two- and three-byte
// characters are decoded to a codepoint that is matched against the ranges
// below.  Any num_bytes outside 1..4 yields kScriptError.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  if (num_bytes == 1) {
    return kScriptOtherUtf8OneByte;
  }

  if (num_bytes == 2) {
    // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
    // at least 16 bits (http://en.cppreference.com/w/cpp/language/types), so
    // it is wide enough.
    constexpr unsigned int kGreekStart = 0x370;
    constexpr unsigned int kGreekEnd = 0x3FF;
    constexpr unsigned int kCyrillicStart = 0x400;
    constexpr unsigned int kCyrillicEnd = 0x4FF;
    constexpr unsigned int kHebrewStart = 0x590;
    constexpr unsigned int kHebrewEnd = 0x5FF;
    constexpr unsigned int kArabicStart = 0x600;
    constexpr unsigned int kArabicEnd = 0x6FF;

    // Payload: low 5 bits of the lead byte, low 6 bits of the trail byte.
    const unsigned int lead_bits = p[0] & 0x1F;
    const unsigned int trail_bits = p[1] & 0x3F;
    const unsigned int code = (lead_bits << 6) | trail_bits;

    if (InRange(code, kGreekStart, kGreekEnd)) {
      return kScriptGreek;
    }
    if (InRange(code, kCyrillicStart, kCyrillicEnd)) {
      return kScriptCyrillic;
    }
    if (InRange(code, kHebrewStart, kHebrewEnd)) {
      return kScriptHebrew;
    }
    if (InRange(code, kArabicStart, kArabicEnd)) {
      return kScriptArabic;
    }
    return kScriptOtherUtf8TwoBytes;
  }

  if (num_bytes == 3) {
    // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
    // at least 16 bits.
    constexpr unsigned int kHangulJamoStart = 0x1100;
    constexpr unsigned int kHangulJamoEnd = 0x11FF;
    constexpr unsigned int kHiraganaStart = 0x3041;
    constexpr unsigned int kHiraganaEnd = 0x309F;
    constexpr unsigned int kKatakanaStart = 0x30A0;
    constexpr unsigned int kKatakanaEnd = 0x30FF;

    // Payload: low 4 bits of the lead byte, low 6 bits of each trail byte.
    const unsigned int high_bits = p[0] & 0x0F;
    const unsigned int middle_bits = p[1] & 0x3F;
    const unsigned int low_bits = p[2] & 0x3F;
    const unsigned int code = (high_bits << 12) | (middle_bits << 6) | low_bits;

    if (InRange(code, kHangulJamoStart, kHangulJamoEnd)) {
      return kScriptHangulJamo;
    }
    if (InRange(code, kHiraganaStart, kHiraganaEnd)) {
      return kScriptHiragana;
    }
    if (InRange(code, kKatakanaStart, kKatakanaEnd)) {
      return kScriptKatakana;
    }
    return kScriptOtherUtf8ThreeBytes;
  }

  if (num_bytes == 4) {
    return kScriptOtherUtf8FourBytes;
  }

  return kScriptError;
}

// Returns Script for the UTF8 character that starts at address p.  Same as
// the overload above, except that it accepts "char" instead of "unsigned
// char".  Whether plain char is signed depends on the platform; forwarding
// through an unsigned char pointer guarantees the bytes are always treated as
// unsigned.
inline Script GetScript(const char *p, int num_bytes) {
  const unsigned char *const bytes =
      reinterpret_cast<const unsigned char *>(p);
  return GetScript(bytes, num_bytes);
}

}  // namespace chrome_lang_id
|
155
|
+
|
156
|
+
#endif // SCRIPT_DETECTOR_H_
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Protocol buffer specification for sentence analysis.
|
17
|
+
|
18
|
+
syntax = "proto2";
|
19
|
+
option optimize_for = LITE_RUNTIME;
|
20
|
+
|
21
|
+
package chrome_lang_id;
|
22
|
+
|
23
|
+
// A Sentence contains the raw text contents of a sentence, as well as an
|
24
|
+
// analysis.
|
25
|
+
message Sentence {
  // Identifier of the sentence.
  optional string id = 1;

  // The sentence's raw text.
  optional string text = 2;

  // Tokens the sentence has been split into.
  repeated Token token = 3;

  // Field numbers from 1000 upwards are reserved for extensions.
  extensions 1000 to max;
}
|
37
|
+
|
38
|
+
// A sentence token marks a span of bytes in the sentence text as a token
|
39
|
+
// or word.
|
40
|
+
message Token {
  // Word form of the token.
  required string word = 1;

  // Position in the text at which the token starts.
  required int32 start = 2;

  // Position in the text at which the token ends.  This is the index of the
  // last byte, not one past the last byte.  If the token came from the lexer,
  // trailing HTML tags are excluded.
  required int32 end = 3;

  // Head of this token in the dependency tree, i.e. the id of the token that
  // has an arc going to this one.  Set to -1 for the root token of a
  // sentence.
  optional int32 head = 4 [default = -1];

  // Part-of-speech tag of the token.
  optional string tag = 5;

  // Coarse-grained word category of the token.
  optional string category = 6;

  // Label of the dependency relation between this token and its head.
  optional string label = 7;

  // Describes how a token was separated from the previous token in the text.
  enum BreakLevel {
    NO_BREAK = 0;        // No separation between tokens.
    SPACE_BREAK = 1;     // Tokens separated by space.
    LINE_BREAK = 2;      // Tokens separated by line break.
    SENTENCE_BREAK = 3;  // Tokens separated by sentence break.
  }

  // Break level of this token; defaults to SPACE_BREAK.
  optional BreakLevel break_level = 8 [default = SPACE_BREAK];

  // Field numbers from 1000 upwards are reserved for extensions.
  extensions 1000 to max;
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "sentence_features.h"
|
17
|
+
|
18
|
+
#include "registry.h"
|
19
|
+
|
20
|
+
namespace chrome_lang_id {
|
21
|
+
|
22
|
+
// Declare registry for the whole Sentence feature functions. NOTE: this is not
|
23
|
+
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
|
24
|
+
// constructor, *before* we use any feature.
|
25
|
+
template <>
|
26
|
+
WholeSentenceFeature::Registry
|
27
|
+
*RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
|
28
|
+
|
29
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Features that operate on Sentence objects. Most features are defined
|
17
|
+
// in this header so they may be re-used via composition into other more
|
18
|
+
// advanced feature classes.
|
19
|
+
|
20
|
+
#ifndef SENTENCE_FEATURES_H_
|
21
|
+
#define SENTENCE_FEATURES_H_
|
22
|
+
|
23
|
+
#include "feature_extractor.h"
|
24
|
+
#include "cld_3/protos/sentence.pb.h"
|
25
|
+
|
26
|
+
namespace chrome_lang_id {
|
27
|
+
|
28
|
+
// Feature function that extracts features for the full Sentence.
|
29
|
+
typedef FeatureFunction<Sentence> WholeSentenceFeature;
|
30
|
+
|
31
|
+
typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
|
32
|
+
|
33
|
+
} // namespace chrome_lang_id
|
34
|
+
|
35
|
+
#endif // SENTENCE_FEATURES_H_
|
@@ -0,0 +1,72 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef SIMPLE_ADDER_H_
|
17
|
+
#define SIMPLE_ADDER_H_
|
18
|
+
|
19
|
+
#include "base.h"
|
20
|
+
|
21
|
+
namespace chrome_lang_id {
|
22
|
+
|
23
|
+
// Class for adding (possibly) scaled arrays.
|
24
|
+
class SimpleAdder {
|
25
|
+
public:
|
26
|
+
static constexpr const int kNumFloatsPerBatch = 1;
|
27
|
+
|
28
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE SimpleAdder(float *dest, int num_floats)
|
29
|
+
: dest_(dest), num_floats_(num_floats) {}
|
30
|
+
|
31
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE ~SimpleAdder() {
|
32
|
+
// Should call Finalize function before destruction.
|
33
|
+
CLD3_DCHECK(dest_ == nullptr);
|
34
|
+
}
|
35
|
+
|
36
|
+
// Caller must call this function before calling deconstruct this object.
|
37
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE void Finalize() { dest_ = nullptr; }
|
38
|
+
|
39
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyAdd(const float *source) const {
|
40
|
+
AddImpl(source, num_floats_, dest_);
|
41
|
+
}
|
42
|
+
|
43
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyScaleAdd(const float *source,
|
44
|
+
const float scale) const {
|
45
|
+
ScaleAddImpl(source, num_floats_, scale, dest_);
|
46
|
+
}
|
47
|
+
|
48
|
+
// Simple fast while loop to implement dest += source.
|
49
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE static void AddImpl(
|
50
|
+
const float *__restrict source, uint32 size, float *__restrict dest) {
|
51
|
+
for (uint32 i = 0; i < size; ++i) {
|
52
|
+
dest[i] += source[i];
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
// Simple fast while loop to implement dest += scale * source.
|
57
|
+
CLD3_ATTRIBUTE_ALWAYS_INLINE static void ScaleAddImpl(
|
58
|
+
const float *__restrict source, uint32 size, const float scale,
|
59
|
+
float *__restrict dest) {
|
60
|
+
for (uint32 i = 0; i < size; ++i) {
|
61
|
+
dest[i] += source[i] * scale;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
private:
|
66
|
+
float *dest_;
|
67
|
+
int num_floats_;
|
68
|
+
};
|
69
|
+
|
70
|
+
} // namespace chrome_lang_id
|
71
|
+
|
72
|
+
#endif // SIMPLE_ADDER_H_
|