cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Protocol buffers for feature extractor.
|
17
|
+
|
18
|
+
syntax = "proto2";
|
19
|
+
option optimize_for = LITE_RUNTIME;
|
20
|
+
|
21
|
+
package chrome_lang_id;
|
22
|
+
|
23
|
+
// A single named parameter, expressed as a (name, value) pair of strings.
// Used by FeatureFunctionDescriptor to carry its named parameters.
message Parameter {
  // Parameter name.
  optional string name = 1;

  // Parameter value, stored as a string.
  optional string value = 2;
}
|
27
|
+
|
28
|
+
// Descriptor for a feature function: its type, optional name and argument,
// its named parameters, and any nested sub-feature function descriptors.
message FeatureFunctionDescriptor {
  // Feature function type. Declared `required` in this proto2 schema; the
  // label is kept unchanged so existing readers and writers stay compatible.
  required string type = 1;

  // Feature function name.
  optional string name = 2;

  // Default argument for the feature function (0 when not set).
  optional int32 argument = 3 [default = 0];

  // Named parameters for this feature descriptor.
  repeated Parameter parameter = 4;

  // Nested sub-feature function descriptors (the message is recursive).
  repeated FeatureFunctionDescriptor feature = 7;
}
|
45
|
+
|
46
|
+
// Descriptor for a feature extractor: the list of top-level feature
// functions it is composed of.
message FeatureExtractorDescriptor {
  // Top-level feature functions for the extractor.
  repeated FeatureFunctionDescriptor feature = 1;
}
|
@@ -0,0 +1,72 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "feature_types.h"
|
17
|
+
|
18
|
+
#include <algorithm>
|
19
|
+
#include <map>
|
20
|
+
#include <string>
|
21
|
+
#include <utility>
|
22
|
+
|
23
|
+
#include "base.h"
|
24
|
+
|
25
|
+
namespace chrome_lang_id {
|
26
|
+
|
27
|
+
// Initializes a feature type called |name| with a base predicate of 0.
// The type is flagged as continuous iff |name| contains the substring
// "continuous".
FeatureType::FeatureType(const string &name)
    : name_(name),
      base_(0),
      is_continuous_(string::npos != name.find("continuous")) {}
|
31
|
+
|
32
|
+
// Out-of-line virtual destructor; there is nothing to release explicitly.
FeatureType::~FeatureType() = default;
|
33
|
+
|
34
|
+
template <class Resource>
|
35
|
+
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
36
|
+
const string &name, const Resource *resource,
|
37
|
+
const std::map<FeatureValue, string> &values)
|
38
|
+
: FeatureType(name), resource_(resource), values_(values) {
|
39
|
+
max_value_ = resource->NumValues() - 1;
|
40
|
+
for (const auto &pair : values) {
|
41
|
+
CLD3_DCHECK(pair.first >= resource->NumValues());
|
42
|
+
max_value_ = pair.first > max_value_ ? pair.first : max_value_;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
template <class Resource>
|
47
|
+
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
48
|
+
const string &name, const Resource *resource)
|
49
|
+
: ResourceBasedFeatureType(name, resource, {}) {}
|
50
|
+
|
51
|
+
// Builds an enum feature type from a map of feature values to their names.
// domain_size_ ends up one greater than the largest key in |value_names|;
// every key is checked (CLD3_DCHECK) to be non-negative.
EnumFeatureType::EnumFeatureType(
    const string &name, const std::map<FeatureValue, string> &value_names)
    : FeatureType(name), value_names_(value_names) {
  for (auto it = value_names.begin(); it != value_names.end(); ++it) {
    CLD3_DCHECK(it->first >= 0);
    const FeatureValue needed_size = it->first + 1;
    if (domain_size_ < needed_size) {
      domain_size_ = needed_size;
    }
  }
}
|
59
|
+
|
60
|
+
// Out-of-line destructor; members clean up after themselves.
EnumFeatureType::~EnumFeatureType() = default;
|
61
|
+
|
62
|
+
// Returns the name registered for |value|, or "<INVALID>" when |value| is
// not a key of value_names_.
string EnumFeatureType::GetFeatureValueName(FeatureValue value) const {
  const auto found = value_names_.find(value);
  return found != value_names_.end() ? found->second : string("<INVALID>");
}
|
69
|
+
|
70
|
+
// Returns the domain size computed by the constructor.
FeatureValue EnumFeatureType::GetDomainSize() const {
  return domain_size_;
}
|
71
|
+
|
72
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,158 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Common feature types for parser components.
|
17
|
+
|
18
|
+
#ifndef FEATURE_TYPES_H_
|
19
|
+
#define FEATURE_TYPES_H_
|
20
|
+
|
21
|
+
#include <algorithm>
|
22
|
+
#include <map>
|
23
|
+
#include <string>
|
24
|
+
#include <utility>
|
25
|
+
|
26
|
+
#include "base.h"
|
27
|
+
|
28
|
+
namespace chrome_lang_id {
|
29
|
+
|
30
|
+
// TODO(djweiss) Clean this up as well.
|
31
|
+
// Use the same type for feature values as is used for predicates.
|
32
|
+
typedef int64 Predicate;
|
33
|
+
typedef Predicate FeatureValue;
|
34
|
+
|
35
|
+
// Each feature value in a feature vector has a feature type. The feature type
|
36
|
+
// is used for converting feature type and value pairs to predicate values. The
|
37
|
+
// feature type can also return names for feature values and calculate the size
|
38
|
+
// of the feature value domain. The FeatureType class is abstract and must be
|
39
|
+
// specialized for the concrete feature types.
|
40
|
+
class FeatureType {
|
41
|
+
public:
|
42
|
+
// Initializes a feature type.
|
43
|
+
explicit FeatureType(const string &name);
|
44
|
+
|
45
|
+
virtual ~FeatureType();
|
46
|
+
|
47
|
+
// Converts a feature value to a name.
|
48
|
+
virtual string GetFeatureValueName(FeatureValue value) const = 0;
|
49
|
+
|
50
|
+
// Returns the size of the feature values domain.
|
51
|
+
virtual int64 GetDomainSize() const = 0;
|
52
|
+
|
53
|
+
// Returns the feature type name.
|
54
|
+
const string &name() const { return name_; }
|
55
|
+
|
56
|
+
Predicate base() const { return base_; }
|
57
|
+
void set_base(Predicate base) { base_ = base; }
|
58
|
+
|
59
|
+
// Returns true iff this feature is continuous; see FloatFeatureValue.
|
60
|
+
bool is_continuous() const { return is_continuous_; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
// Feature type name.
|
64
|
+
string name_;
|
65
|
+
|
66
|
+
// "Base" feature value: i.e. a "slot" in a global ordering of features.
|
67
|
+
Predicate base_;
|
68
|
+
|
69
|
+
// See doc for is_continuous().
|
70
|
+
bool is_continuous_;
|
71
|
+
};
|
72
|
+
|
73
|
+
// Templated generic resource based feature type. This feature type delegates
|
74
|
+
// look up of feature value names to an unknown resource class, which is not
|
75
|
+
// owned. Optionally, this type can also store a mapping of extra values which
|
76
|
+
// are not in the resource.
|
77
|
+
//
|
78
|
+
// Note: this class assumes that Resource->GetFeatureValueName() will return
|
79
|
+
// successfully for values ONLY in the range [0, Resource->NumValues()). Any
|
80
|
+
// feature value not in the extra value map and not in the above range of
|
81
|
+
// Resource will result in a return of "<INVALID>".
|
82
|
+
template <class Resource>
|
83
|
+
class ResourceBasedFeatureType : public FeatureType {
|
84
|
+
public:
|
85
|
+
// Creates a new type with given name, resource object, and a mapping of
|
86
|
+
// special values. The values must be greater or equal to
|
87
|
+
// resource->NumValues() so as to avoid collisions; this is verified with
|
88
|
+
// CHECK at creation.
|
89
|
+
ResourceBasedFeatureType(const string &name, const Resource *resource,
|
90
|
+
const std::map<FeatureValue, string> &values);
|
91
|
+
|
92
|
+
// Creates a new type with no special values.
|
93
|
+
ResourceBasedFeatureType(const string &name, const Resource *resource);
|
94
|
+
|
95
|
+
// Returns the feature name for a given feature value. First checks the values
|
96
|
+
// map, then checks the resource to look up the name.
|
97
|
+
string GetFeatureValueName(FeatureValue value) const override {
|
98
|
+
if (values_.find(value) != values_.end()) {
|
99
|
+
return values_.find(value)->second;
|
100
|
+
}
|
101
|
+
if (value >= 0 && value < resource_->NumValues()) {
|
102
|
+
return resource_->GetFeatureValueName(value);
|
103
|
+
} else {
|
104
|
+
return "<INVALID>";
|
105
|
+
}
|
106
|
+
}
|
107
|
+
|
108
|
+
// Returns the number of possible values for this feature type. This is the
|
109
|
+
// based on the largest value that was observed in the extra values.
|
110
|
+
FeatureValue GetDomainSize() const override { return max_value_ + 1; }
|
111
|
+
|
112
|
+
protected:
|
113
|
+
// Shared resource. Not owned.
|
114
|
+
const Resource *resource_ = nullptr;
|
115
|
+
|
116
|
+
// Maximum possible value this feature could take.
|
117
|
+
FeatureValue max_value_;
|
118
|
+
|
119
|
+
// Mapping for extra feature values not in the resource.
|
120
|
+
std::map<FeatureValue, string> values_;
|
121
|
+
};
|
122
|
+
|
123
|
+
// Feature type that is defined using an explicit map from FeatureValue to
|
124
|
+
// string values. This can reduce some of the boilerplate when defining
|
125
|
+
// features that generate enum values. Example usage:
|
126
|
+
//
|
127
|
+
// class BeverageSizeFeature : public FeatureFunction<Beverage> {
|
128
|
+
// enum FeatureValue { SMALL, MEDIUM, LARGE }; // values for this feature
|
129
|
+
// void Init(TaskContext *context) override {
|
130
|
+
// set_feature_type(new EnumFeatureType("beverage_size",
|
131
|
+
// {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}}));
|
132
|
+
// }
|
133
|
+
// [...]
|
134
|
+
// };
|
135
|
+
class EnumFeatureType : public FeatureType {
|
136
|
+
public:
|
137
|
+
EnumFeatureType(const string &name,
|
138
|
+
const std::map<FeatureValue, string> &value_names);
|
139
|
+
~EnumFeatureType() override;
|
140
|
+
|
141
|
+
// Returns the feature name for a given feature value.
|
142
|
+
string GetFeatureValueName(FeatureValue value) const override;
|
143
|
+
|
144
|
+
// Returns the number of possible values for this feature type. This is one
|
145
|
+
// greater than the largest value in the value_names map.
|
146
|
+
FeatureValue GetDomainSize() const override;
|
147
|
+
|
148
|
+
protected:
|
149
|
+
// Maximum possible value this feature could take.
|
150
|
+
FeatureValue domain_size_ = 0;
|
151
|
+
|
152
|
+
// Names of feature values.
|
153
|
+
std::map<FeatureValue, string> value_names_;
|
154
|
+
};
|
155
|
+
|
156
|
+
} // namespace chrome_lang_id
|
157
|
+
|
158
|
+
#endif // FEATURE_TYPES_H_
|
@@ -0,0 +1,55 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
|
19
|
+
#include "fixunicodevalue.h"
|
20
|
+
#include "integral_types.h"
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
namespace CLD2 {
|
24
|
+
|
25
|
+
// Guarantees that the resulting output value is interchange valid
|
26
|
+
// 00-FF; map to spaces or MS CP1252
|
27
|
+
// D800-DFFF; surrogates
|
28
|
+
// FDD0-FDEF; non-characters
|
29
|
+
// xxFFFE-xxFFFF; non-characters
|
30
|
+
// Maps |uv| to an interchange-valid code point:
//   00-FF           looked up in kMapFullMicrosoft1252OrSpace
//   0100-D7FF       returned unchanged
//   FDD0-FDEF       non-characters, replaced by U+FFFD
//   xxFFFE-xxFFFF   non-characters, replaced by U+FFFD
//   E000-10FFFF     otherwise returned unchanged
//   anything else   (surrogates, negative, > 10FFFF) replaced by U+FFFD
char32 FixUnicodeValue(char32 uv) {
  const uint32 code = static_cast<uint32>(uv);
  if (code < 0x0100) {
    return kMapFullMicrosoft1252OrSpace[code];
  }
  if (code < 0xD800) {
    return uv;
  }

  // FDD0-FDDF and FDE0-FDEF form one contiguous non-character range.
  const bool in_fdd0_range = (0xFDD0 <= code) && (code <= 0xFDEF);
  // Last two code points of any plane: low 16 bits are FFFE or FFFF.
  const bool is_plane_tail = (code & 0x00FFFE) == 0xFFFE;
  // Valid scalar values at or above the end of the surrogate block.
  const bool is_high_scalar = (0xE000 <= code) && (code <= 0x10FFFF);

  if (!in_fdd0_range && !is_plane_tail && is_high_scalar) {
    return uv;
  }
  return 0xFFFD;
}
|
53
|
+
|
54
|
+
} // End namespace CLD2
|
55
|
+
} // End namespace chrome_lang_id
|
@@ -0,0 +1,69 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
19
|
+
// code points. C0 and C1 control codes that are not interchange-valid
|
20
|
+
// are mapped to spaces.
|
21
|
+
|
22
|
+
|
23
|
+
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
24
|
+
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
25
|
+
|
26
|
+
#include "integral_types.h" // for char32
|
27
|
+
#include "port.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
namespace CLD2 {
|
31
|
+
|
32
|
+
// Map byte value 0000-00FF to char32
|
33
|
+
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
34
|
+
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
35
|
+
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
36
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
37
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
38
|
+
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
39
|
+
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
40
|
+
|
41
|
+
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
42
|
+
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
43
|
+
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
44
|
+
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
45
|
+
|
46
|
+
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
47
|
+
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
48
|
+
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
49
|
+
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
50
|
+
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
51
|
+
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
52
|
+
|
53
|
+
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
54
|
+
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
55
|
+
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
56
|
+
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
57
|
+
};
|
58
|
+
|
59
|
+
// Guarantees that the resulting output value is interchange valid
|
60
|
+
// 00-FF; map to spaces or MS CP1252
|
61
|
+
// D800-DFFF; surrogates
|
62
|
+
// FDD0-FDEF; non-characters
|
63
|
+
// xxFFFE-xxFFFF; non-characters
|
64
|
+
char32 FixUnicodeValue(char32 uv);
|
65
|
+
|
66
|
+
} // End namespace CLD2
|
67
|
+
} // End namespace chrome_lang_id
|
68
|
+
|
69
|
+
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
data/ext/cld3/float16.h
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef FLOAT16_H_
|
17
|
+
#define FLOAT16_H_
|
18
|
+
|
19
|
+
#include <string.h> // for memcpy
|
20
|
+
|
21
|
+
#include "base.h"
|
22
|
+
#include "casts.h"
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
// Compact 16-bit encoding of floating point numbers. This
|
27
|
+
// representation uses 1 bit for the sign, 8 bits for the exponent and
|
28
|
+
// 7 bits for the mantissa. It is assumed that floats are in IEEE 754
|
29
|
+
// format so a float16 is just bits 16-31 of a single precision float.
|
30
|
+
//
|
31
|
+
// NOTE: The IEEE floating point standard defines a float16 format that
|
32
|
+
// is different than this format (it has fewer bits of exponent and more
|
33
|
+
// bits of mantissa). We don't use that format here because conversion
|
34
|
+
// to/from 32-bit floats is more complex for that format, and the
|
35
|
+
// conversion for this format is very simple.
|
36
|
+
//
|
37
|
+
// <---------float16------------>
|
38
|
+
// s e e e e e e e e f f f f f f f f f f f f f f f f f f f f f f f
|
39
|
+
// <------------------------------float-------------------------->
|
40
|
+
// 3 3             2 2             1 1                           0
|
41
|
+
// 1 0             3 2             5 4                           0
|
42
|
+
|
43
|
+
typedef uint16 float16;
|
44
|
+
|
45
|
+
// Converts a 32-bit float to the compact 16-bit form by keeping only its
// upper 16 bits. The discarded mantissa bits are truncated, not rounded.
static inline float16 Float32To16(float f) {
  const uint32 bits = lang_id_bit_cast<uint32>(f);
  return static_cast<float16>(bits >> 16);
}
|
50
|
+
|
51
|
+
// Expands a float16 back to a 32-bit float by placing its 16 bits in the
// upper half of the word. The new low mantissa bits are filled with 0;
// nothing smarter is attempted.
static inline float Float16To32(float16 f) {
  // Widen to uint32 before shifting: a bare `f << 16` promotes the uint16 to
  // a signed int, so inputs with the top bit set would shift into the sign
  // bit. The unsigned shift is well-defined for every input.
  const uint32 bits = static_cast<uint32>(f) << 16;
  return lang_id_bit_cast<float>(bits);
}
|
55
|
+
|
56
|
+
} // namespace chrome_lang_id
|
57
|
+
|
58
|
+
#endif // FLOAT16_H_
|