cld3 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,50 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Protocol buffers for feature extractor.
|
17
|
+
|
18
|
+
syntax = "proto2";
|
19
|
+
option optimize_for = LITE_RUNTIME;
|
20
|
+
|
21
|
+
package chrome_lang_id;
|
22
|
+
|
23
|
+
// A single named parameter: a name paired with a value, both carried as
// strings. FeatureFunctionDescriptor holds a repeated list of these.
message Parameter {
  // Parameter name.
  optional string name = 1;

  // Parameter value.
  optional string value = 2;
}
|
27
|
+
|
28
|
+
// Describes one feature function, including any sub-feature functions nested
// beneath it.
//
// NOTE(review): field numbers 5 and 6 are not used by this message. If they
// belonged to fields that were removed, consider reserving them -- confirm
// against the schema history.
message FeatureFunctionDescriptor {
  // Type of the feature function. This field is required.
  required string type = 1;

  // Name of the feature function.
  optional string name = 2;

  // Default argument passed to the feature function; 0 when unset.
  optional int32 argument = 3 [default = 0];

  // Named parameters attached to this feature descriptor.
  repeated Parameter parameter = 4;

  // Descriptors of the nested sub-feature functions.
  repeated FeatureFunctionDescriptor feature = 7;
}
|
45
|
+
|
46
|
+
// Describes a feature extractor as the list of its top-level feature
// functions.
message FeatureExtractorDescriptor {
  // Top-level feature functions of the extractor.
  repeated FeatureFunctionDescriptor feature = 1;
}
|
@@ -0,0 +1,72 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "feature_types.h"
|
17
|
+
|
18
|
+
#include <algorithm>
|
19
|
+
#include <map>
|
20
|
+
#include <string>
|
21
|
+
#include <utility>
|
22
|
+
|
23
|
+
#include "base.h"
|
24
|
+
|
25
|
+
namespace chrome_lang_id {
|
26
|
+
|
27
|
+
// Builds a feature type called `name`. The base predicate starts at zero, and
// the type is flagged as continuous exactly when `name` contains the
// substring "continuous".
FeatureType::FeatureType(const string &name)
    : name_(name),
      base_(0),
      is_continuous_(string::npos != name.find("continuous")) {}
|
31
|
+
|
32
|
+
// Out-of-line destructor; there is nothing to release beyond the members.
FeatureType::~FeatureType() = default;
|
33
|
+
|
34
|
+
template <class Resource>
|
35
|
+
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
36
|
+
const string &name, const Resource *resource,
|
37
|
+
const std::map<FeatureValue, string> &values)
|
38
|
+
: FeatureType(name), resource_(resource), values_(values) {
|
39
|
+
max_value_ = resource->NumValues() - 1;
|
40
|
+
for (const auto &pair : values) {
|
41
|
+
CLD3_DCHECK(pair.first >= resource->NumValues());
|
42
|
+
max_value_ = pair.first > max_value_ ? pair.first : max_value_;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
template <class Resource>
|
47
|
+
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
48
|
+
const string &name, const Resource *resource)
|
49
|
+
: ResourceBasedFeatureType(name, resource, {}) {}
|
50
|
+
|
51
|
+
// Builds an enum feature type from an explicit value -> name map. The domain
// size ends up one greater than the largest key in `value_names` (it stays at
// its initial value when the map is empty). Keys are DCHECKed to be
// non-negative.
EnumFeatureType::EnumFeatureType(
    const string &name, const std::map<FeatureValue, string> &value_names)
    : FeatureType(name), value_names_(value_names) {
  for (const auto &entry : value_names) {
    CLD3_DCHECK(entry.first >= 0);
    const FeatureValue size_needed = entry.first + 1;
    if (domain_size_ < size_needed) {
      domain_size_ = size_needed;
    }
  }
}
|
59
|
+
|
60
|
+
// Out-of-line destructor; there is nothing to release beyond the members.
EnumFeatureType::~EnumFeatureType() = default;
|
61
|
+
|
62
|
+
// Returns the name registered for `value`, or "<INVALID>" when the value is
// not present in the name map.
string EnumFeatureType::GetFeatureValueName(FeatureValue value) const {
  const auto entry = value_names_.find(value);
  const bool is_known = entry != value_names_.end();
  return is_known ? entry->second : string("<INVALID>");
}
|
69
|
+
|
70
|
+
// Returns the domain size computed by the constructor.
FeatureValue EnumFeatureType::GetDomainSize() const {
  return domain_size_;
}
|
71
|
+
|
72
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,158 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Common feature types for parser components.
|
17
|
+
|
18
|
+
#ifndef FEATURE_TYPES_H_
|
19
|
+
#define FEATURE_TYPES_H_
|
20
|
+
|
21
|
+
#include <algorithm>
|
22
|
+
#include <map>
|
23
|
+
#include <string>
|
24
|
+
#include <utility>
|
25
|
+
|
26
|
+
#include "base.h"
|
27
|
+
|
28
|
+
namespace chrome_lang_id {
|
29
|
+
|
30
|
+
// TODO(djweiss) Clean this up as well.
|
31
|
+
// Use the same type for feature values as is used for predicated.
|
32
|
+
typedef int64 Predicate;
|
33
|
+
typedef Predicate FeatureValue;
|
34
|
+
|
35
|
+
// Each feature value in a feature vector has a feature type. The feature type
|
36
|
+
// is used for converting feature type and value pairs to predicate values. The
|
37
|
+
// feature type can also return names for feature values and calculate the size
|
38
|
+
// of the feature value domain. The FeatureType class is abstract and must be
|
39
|
+
// specialized for the concrete feature types.
|
40
|
+
class FeatureType {
|
41
|
+
public:
|
42
|
+
// Initializes a feature type.
|
43
|
+
explicit FeatureType(const string &name);
|
44
|
+
|
45
|
+
virtual ~FeatureType();
|
46
|
+
|
47
|
+
// Converts a feature value to a name.
|
48
|
+
virtual string GetFeatureValueName(FeatureValue value) const = 0;
|
49
|
+
|
50
|
+
// Returns the size of the feature values domain.
|
51
|
+
virtual int64 GetDomainSize() const = 0;
|
52
|
+
|
53
|
+
// Returns the feature type name.
|
54
|
+
const string &name() const { return name_; }
|
55
|
+
|
56
|
+
Predicate base() const { return base_; }
|
57
|
+
void set_base(Predicate base) { base_ = base; }
|
58
|
+
|
59
|
+
// Returns true iff this feature is continuous; see FloatFeatureValue.
|
60
|
+
bool is_continuous() const { return is_continuous_; }
|
61
|
+
|
62
|
+
private:
|
63
|
+
// Feature type name.
|
64
|
+
string name_;
|
65
|
+
|
66
|
+
// "Base" feature value: i.e. a "slot" in a global ordering of features.
|
67
|
+
Predicate base_;
|
68
|
+
|
69
|
+
// See doc for is_continuous().
|
70
|
+
bool is_continuous_;
|
71
|
+
};
|
72
|
+
|
73
|
+
// Templated generic resource based feature type. This feature type delegates
|
74
|
+
// look up of feature value names to an unknown resource class, which is not
|
75
|
+
// owned. Optionally, this type can also store a mapping of extra values which
|
76
|
+
// are not in the resource.
|
77
|
+
//
|
78
|
+
// Note: this class assumes that Resource->GetFeatureValueName() will return
|
79
|
+
// successfully for values ONLY in the range [0, Resource->NumValues()) Any
|
80
|
+
// feature value not in the extra value map and not in the above range of
|
81
|
+
// Resource will result in a ERROR and return of "<INVALID>".
|
82
|
+
template <class Resource>
|
83
|
+
class ResourceBasedFeatureType : public FeatureType {
|
84
|
+
public:
|
85
|
+
// Creates a new type with given name, resource object, and a mapping of
|
86
|
+
// special values. The values must be greater or equal to
|
87
|
+
// resource->NumValues() so as to avoid collisions; this is verified with
|
88
|
+
// CHECK at creation.
|
89
|
+
ResourceBasedFeatureType(const string &name, const Resource *resource,
|
90
|
+
const std::map<FeatureValue, string> &values);
|
91
|
+
|
92
|
+
// Creates a new type with no special values.
|
93
|
+
ResourceBasedFeatureType(const string &name, const Resource *resource);
|
94
|
+
|
95
|
+
// Returns the feature name for a given feature value. First checks the values
|
96
|
+
// map, then checks the resource to look up the name.
|
97
|
+
string GetFeatureValueName(FeatureValue value) const override {
|
98
|
+
if (values_.find(value) != values_.end()) {
|
99
|
+
return values_.find(value)->second;
|
100
|
+
}
|
101
|
+
if (value >= 0 && value < resource_->NumValues()) {
|
102
|
+
return resource_->GetFeatureValueName(value);
|
103
|
+
} else {
|
104
|
+
return "<INVALID>";
|
105
|
+
}
|
106
|
+
}
|
107
|
+
|
108
|
+
// Returns the number of possible values for this feature type. This is the
|
109
|
+
// based on the largest value that was observed in the extra values.
|
110
|
+
FeatureValue GetDomainSize() const override { return max_value_ + 1; }
|
111
|
+
|
112
|
+
protected:
|
113
|
+
// Shared resource. Not owned.
|
114
|
+
const Resource *resource_ = nullptr;
|
115
|
+
|
116
|
+
// Maximum possible value this feature could take.
|
117
|
+
FeatureValue max_value_;
|
118
|
+
|
119
|
+
// Mapping for extra feature values not in the resource.
|
120
|
+
std::map<FeatureValue, string> values_;
|
121
|
+
};
|
122
|
+
|
123
|
+
// Feature type that is defined using an explicit map from FeatureValue to
|
124
|
+
// string values. This can reduce some of the boilerplate when defining
|
125
|
+
// features that generate enum values. Example usage:
|
126
|
+
//
|
127
|
+
// class BeverageSizeFeature : public FeatureFunction<Beverage>
|
128
|
+
// enum FeatureValue { SMALL, MEDIUM, LARGE }; // values for this feature
|
129
|
+
// void Init(TaskContext *context) override {
|
130
|
+
// set_feature_type(new EnumFeatureType("beverage_size",
|
131
|
+
// {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}});
|
132
|
+
// }
|
133
|
+
// [...]
|
134
|
+
// };
|
135
|
+
class EnumFeatureType : public FeatureType {
|
136
|
+
public:
|
137
|
+
EnumFeatureType(const string &name,
|
138
|
+
const std::map<FeatureValue, string> &value_names);
|
139
|
+
~EnumFeatureType() override;
|
140
|
+
|
141
|
+
// Returns the feature name for a given feature value.
|
142
|
+
string GetFeatureValueName(FeatureValue value) const override;
|
143
|
+
|
144
|
+
// Returns the number of possible values for this feature type. This is one
|
145
|
+
// greater than the largest value in the value_names map.
|
146
|
+
FeatureValue GetDomainSize() const override;
|
147
|
+
|
148
|
+
protected:
|
149
|
+
// Maximum possible value this feature could take.
|
150
|
+
FeatureValue domain_size_ = 0;
|
151
|
+
|
152
|
+
// Names of feature values.
|
153
|
+
std::map<FeatureValue, string> value_names_;
|
154
|
+
};
|
155
|
+
|
156
|
+
} // namespace chrome_lang_id
|
157
|
+
|
158
|
+
#endif // FEATURE_TYPES_H_
|
@@ -0,0 +1,55 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
|
19
|
+
#include "fixunicodevalue.h"
|
20
|
+
#include "integral_types.h"
|
21
|
+
|
22
|
+
namespace chrome_lang_id {
|
23
|
+
namespace CLD2 {
|
24
|
+
|
25
|
+
// Guarantees that the resulting output value is interchange valid
|
26
|
+
// 00-FF; map to spaces or MS CP1252
|
27
|
+
// D800-DFFF; surrogates
|
28
|
+
// FDD0-FDEF; non-characters
|
29
|
+
// xxFFFE-xxFFFF; non-characters
|
30
|
+
char32 FixUnicodeValue(char32 uv) {
|
31
|
+
uint32 uuv = static_cast<uint32>(uv);
|
32
|
+
if (uuv < 0x0100) {
|
33
|
+
return kMapFullMicrosoft1252OrSpace[uuv];
|
34
|
+
}
|
35
|
+
if (uuv < 0xD800) {
|
36
|
+
return uv;
|
37
|
+
}
|
38
|
+
if ((uuv & ~0x0F) == 0xFDD0) { // non-characters
|
39
|
+
return 0xFFFD;
|
40
|
+
}
|
41
|
+
if ((uuv & ~0x0F) == 0xFDE0) { // non-characters
|
42
|
+
return 0xFFFD;
|
43
|
+
}
|
44
|
+
if ((uuv & 0x00FFFE) == 0xFFFE) { // non-characters
|
45
|
+
return 0xFFFD;
|
46
|
+
}
|
47
|
+
if ((0xE000 <= uuv) && (uuv <= 0x10FFFF)) {
|
48
|
+
return uv;
|
49
|
+
}
|
50
|
+
// surrogates and negative and > 0x10FFFF all land here
|
51
|
+
return 0xFFFD;
|
52
|
+
}
|
53
|
+
|
54
|
+
} // End namespace CLD2
|
55
|
+
} // End namespace chrome_lang_id
|
@@ -0,0 +1,69 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Routine that maps a Unicode code point to an interchange-valid one
|
17
|
+
//
|
18
|
+
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
19
|
+
// code points. C0 and C1 control codes that are not interchange-valid
|
20
|
+
// are mapped to spaces.
|
21
|
+
|
22
|
+
|
23
|
+
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
24
|
+
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
25
|
+
|
26
|
+
#include "integral_types.h" // for char32
|
27
|
+
#include "port.h"
|
28
|
+
|
29
|
+
namespace chrome_lang_id {
|
30
|
+
namespace CLD2 {
|
31
|
+
|
32
|
+
// Map byte value 0000-00FF to char32. The table is indexed directly by the
// byte value; FixUnicodeValue() uses it for every input below 0x0100.
|
33
|
+
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
34
|
+
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
// Layout: the rows for 0x80-0x9F hold eight entries each; every other row
// holds sixteen. The trailing "// 00", "// 40", ... comments give the index of
// the first entry of the row they are on.
|
35
|
+
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
36
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
37
|
+
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
38
|
+
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
39
|
+
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
40
|
+
|
41
|
+
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
42
|
+
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
43
|
+
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
44
|
+
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
45
|
+
|
46
|
+
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
47
|
+
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
48
|
+
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
49
|
+
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
50
|
+
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
51
|
+
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
52
|
+
|
53
|
+
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
54
|
+
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
55
|
+
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
56
|
+
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
57
|
+
};
|
58
|
+
|
59
|
+
// Maps the code point uv to an interchange-valid one and returns it.
|
60
|
+
// 00-FF; mapped through kMapFullMicrosoft1252OrSpace (spaces or MS CP1252)
|
61
|
+
// D800-DFFF; surrogates, replaced by U+FFFD
|
62
|
+
// FDD0-FDEF; non-characters, replaced by U+FFFD
|
63
|
+
// xxFFFE-xxFFFF; non-characters, replaced by U+FFFD
// Negative values and values above 0x10FFFF are replaced by U+FFFD as well;
// every other value is returned unchanged.
|
64
|
+
char32 FixUnicodeValue(char32 uv);
|
65
|
+
|
66
|
+
} // End namespace CLD2
|
67
|
+
} // End namespace chrome_lang_id
|
68
|
+
|
69
|
+
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
data/ext/cld3/float16.h
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef FLOAT16_H_
|
17
|
+
#define FLOAT16_H_
|
18
|
+
|
19
|
+
#include <string.h> // for memcpy
|
20
|
+
|
21
|
+
#include "base.h"
|
22
|
+
#include "casts.h"
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
|
26
|
+
// Compact 16-bit encoding of floating point numbers. This
|
27
|
+
// representation uses 1 bit for the sign, 8 bits for the exponent and
|
28
|
+
// 7 bits for the mantissa. It is assumed that floats are in IEEE 754
|
29
|
+
// format so a float16 is just bits 16-31 of a single precision float.
|
30
|
+
//
|
31
|
+
// NOTE: The IEEE floating point standard defines a float16 format that
|
32
|
+
// is different than this format (it has fewer bits of exponent and more
|
33
|
+
// bits of mantissa). We don't use that format here because conversion
|
34
|
+
// to/from 32-bit floats is more complex for that format, and the
|
35
|
+
// conversion for this format is very simple.
|
36
|
+
//
|
37
|
+
// <---------float16------------>
|
38
|
+
// s e e e e e e e e f f f f f f f f f f f f f f f f f f f f f f f
|
39
|
+
// <------------------------------float-------------------------->
|
40
|
+
// 3 3 2 2 1 1 0
|
41
|
+
// 1 0 3 2 5 4 0
|
42
|
+
|
43
|
+
// Storage type for the compact 16-bit float encoding.
using float16 = uint16;
|
44
|
+
|
45
|
+
// Converts a 32-bit float to its float16 encoding by keeping bits 16-31 of
// the float's bit pattern. The low mantissa bits are simply dropped; no
// rounding is attempted.
static inline float16 Float32To16(float f) {
  const uint32 bits = lang_id_bit_cast<uint32>(f);
  return static_cast<float16>(bits >> 16);
}
|
50
|
+
|
51
|
+
// Expands a float16 back into a 32-bit float by placing its 16 bits in bits
// 16-31 of the result. The new low mantissa bits are filled with 0; nothing
// smarter is done.
static inline float Float16To32(float16 f) {
  // Widen to uint32 before shifting. A bare `f << 16` first promotes the
  // 16-bit value to signed int (assuming int is wider than 16 bits), and
  // shifting a set bit 15 into the sign bit of a signed int is undefined or
  // implementation-defined before C++20. Every float16 with the sign bit set
  // would hit that case.
  return lang_id_bit_cast<float>(static_cast<uint32>(f) << 16);
}
|
55
|
+
|
56
|
+
} // namespace chrome_lang_id
|
57
|
+
|
58
|
+
#endif // FLOAT16_H_
|