cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
data/ext/cld3/utils.h
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#ifndef UTILS_H_
|
17
|
+
#define UTILS_H_
|
18
|
+
|
19
|
+
#include <stddef.h>
|
20
|
+
#include <functional>
|
21
|
+
#include <initializer_list>
|
22
|
+
#include <string>
|
23
|
+
#include <vector>
|
24
|
+
|
25
|
+
#include "base.h"
|
26
|
+
#include "script_span/stringpiece.h"
|
27
|
+
|
28
|
+
namespace chrome_lang_id {
|
29
|
+
namespace utils {
|
30
|
+
|
31
|
+
bool ParseInt32(const char *c_str, int *value);
|
32
|
+
bool ParseDouble(const char *c_str, double *value);
|
33
|
+
|
34
|
+
// Parses |str| with the supplied parser callback and returns the result.
//
// |func| is invoked with the NUL-terminated contents of |str| and a pointer
// to the output slot. Its boolean return value is ignored here. The output
// slot is value-initialized before the call, so a callback that returns
// without writing to it yields T{} (0 for arithmetic types) rather than an
// indeterminate value.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T value{};
  func(str.c_str(), &value);
  return value;
}
|
40
|
+
|
41
|
+
// Overload of ParseUsing() with a fallback value: an empty |str| yields
// |defval| without invoking |func|; any other input is forwarded to the
// two-argument overload.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) {
    return defval;
  }
  return ParseUsing<T>(str, func);
}
|
46
|
+
|
47
|
+
string CEscape(const string &src);
|
48
|
+
|
49
|
+
std::vector<string> Split(const string &text, char delim);
|
50
|
+
|
51
|
+
int RemoveLeadingWhitespace(StringPiece *text);
|
52
|
+
|
53
|
+
int RemoveTrailingWhitespace(StringPiece *text);
|
54
|
+
|
55
|
+
int RemoveWhitespaceContext(StringPiece *text);
|
56
|
+
|
57
|
+
uint32 Hash32(const char *data, size_t n, uint32 seed);
|
58
|
+
|
59
|
+
uint32 Hash32WithDefaultSeed(const string &input);
|
60
|
+
|
61
|
+
// Frees every pointer stored in |container| and then empties the container.
// Usable with a vector, set, hash_set, or any other STL-style container of
// raw pointers that offers begin(), end(), and clear().
// Passing a NULL |container| is allowed and does nothing.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  for (auto element : *container) {
    delete element;
  }
  container->clear();
}
|
76
|
+
|
77
|
+
// Static helpers for recognizing punctuation code points and punctuation tags.
class PunctuationUtil {
 public:
  // An inclusive [first, last] range of Unicode characters; the ranges cover
  // the punctuation characters according to CoNLL.
  struct CharacterRange {
    int first;
    int last;
  };

  // Table of punctuation ranges, defined elsewhere. IsPunctuation() relies on
  // the entries being in ascending order and on a terminating entry whose
  // |first| is not positive.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode character |u| lies inside one of the
  // kPunctuation ranges.
  static bool IsPunctuation(int u) {
    for (const CharacterRange *range = kPunctuation; range->first > 0;
         ++range) {
      // |u| sits below this range, so it cannot match this or a later one.
      if (u < range->first) break;
      if (u <= range->last) return true;
    }
    return false;
  }

  // Returns true if every character of |tag| is one of , : . ' `
  // (an empty tag therefore counts as a punctuation tag).
  static bool IsPunctuationTag(const string &tag) {
    return tag.find_first_not_of(",:.'`") == string::npos;
  }

  // Returns true if |tag| is non-empty and consists solely of punctuation
  // characters (, : . ' `) or parentheses.
  static bool IsPunctuationTagOrParens(const string &tag) {
    if (tag.empty()) return false;
    return tag.find_first_not_of("(),:.'`") == string::npos;
  }
};
|
122
|
+
|
123
|
+
void NormalizeDigits(string *form);
|
124
|
+
|
125
|
+
// Takes a text and converts it into a vector, where each element is a utf8
|
126
|
+
// character.
|
127
|
+
void GetUTF8Chars(const string &text, std::vector<string> *chars);
|
128
|
+
|
129
|
+
// Returns the number of bytes in the first UTF-8 char at the beginning
|
130
|
+
// of the string. It is assumed that the string is valid UTF-8. If
|
131
|
+
// the first byte of the string is null, return 0 (for backwards
|
132
|
+
// compatibility only; this use is discouraged).
|
133
|
+
int UTF8FirstLetterNumBytes(const char *in_buf);
|
134
|
+
|
135
|
+
// Returns the length (number of bytes) of the Unicode code point starting at
|
136
|
+
// src, based on inspecting just that one byte. Preconditions: src != NULL,
|
137
|
+
// *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
|
138
|
+
// string.
|
139
|
+
int OneCharLen(const char *src);
|
140
|
+
|
141
|
+
} // namespace utils
|
142
|
+
} // namespace chrome_lang_id
|
143
|
+
|
144
|
+
#endif // UTILS_H_
|
@@ -0,0 +1,64 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
#include "workspace.h"
|
17
|
+
|
18
|
+
#include "base.h"
|
19
|
+
|
20
|
+
namespace chrome_lang_id {
|
21
|
+
|
22
|
+
WorkspaceSet::WorkspaceSet() {}
|
23
|
+
|
24
|
+
WorkspaceSet::~WorkspaceSet() { Reset(WorkspaceRegistry()); }
|
25
|
+
|
26
|
+
WorkspaceRegistry::WorkspaceRegistry() {}
|
27
|
+
|
28
|
+
WorkspaceRegistry::~WorkspaceRegistry() {}
|
29
|
+
|
30
|
+
string WorkspaceRegistry::DebugString() const {
|
31
|
+
string str;
|
32
|
+
for (auto &it : workspace_names_) {
|
33
|
+
const string &type_name = workspace_types_.at(it.first);
|
34
|
+
for (size_t index = 0; index < it.second.size(); ++index) {
|
35
|
+
const string &workspace_name = it.second[index];
|
36
|
+
str += "\n ";
|
37
|
+
str += type_name;
|
38
|
+
str += " :: ";
|
39
|
+
str += workspace_name;
|
40
|
+
}
|
41
|
+
}
|
42
|
+
return str;
|
43
|
+
}
|
44
|
+
|
45
|
+
VectorIntWorkspace::~VectorIntWorkspace() {}
|
46
|
+
|
47
|
+
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}
|
48
|
+
|
49
|
+
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
|
50
|
+
: elements_(size, value) {}
|
51
|
+
|
52
|
+
VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
|
53
|
+
: elements_(elements) {}
|
54
|
+
|
55
|
+
string VectorIntWorkspace::TypeName() { return "Vector"; }
|
56
|
+
|
57
|
+
VectorVectorIntWorkspace::~VectorVectorIntWorkspace() {}
|
58
|
+
|
59
|
+
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
|
60
|
+
: elements_(size) {}
|
61
|
+
|
62
|
+
string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
|
63
|
+
|
64
|
+
} // namespace chrome_lang_id
|
@@ -0,0 +1,177 @@
|
|
1
|
+
/* Copyright 2016 Google Inc. All Rights Reserved.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
==============================================================================*/
|
15
|
+
|
16
|
+
// Notes on thread-safety: All of the classes here are thread-compatible. More
|
17
|
+
// specifically, the registry machinery is thread-safe, as long as each thread
|
18
|
+
// performs feature extraction on a different Sentence object.
|
19
|
+
|
20
|
+
#ifndef WORKSPACE_H_
|
21
|
+
#define WORKSPACE_H_
|
22
|
+
|
23
|
+
#include <stddef.h>
|
24
|
+
#include <string>
|
25
|
+
#include <typeindex>
|
26
|
+
#include <unordered_map>
|
27
|
+
#include <utility>
|
28
|
+
#include <vector>
|
29
|
+
|
30
|
+
#include "base.h"
|
31
|
+
|
32
|
+
namespace chrome_lang_id {
|
33
|
+
|
34
|
+
// A base class for shared workspaces. Derived classes implement a static member
|
35
|
+
// function TypeName() which returns a human readable string name for the class.
|
36
|
+
class Workspace {
|
37
|
+
public:
|
38
|
+
// Polymorphic destructor.
|
39
|
+
virtual ~Workspace() {}
|
40
|
+
|
41
|
+
protected:
|
42
|
+
// Create an empty workspace.
|
43
|
+
Workspace() {}
|
44
|
+
|
45
|
+
private:
|
46
|
+
CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);
|
47
|
+
};
|
48
|
+
|
49
|
+
// A registry that keeps track of workspaces.
|
50
|
+
class WorkspaceRegistry {
|
51
|
+
public:
|
52
|
+
// Create an empty registry.
|
53
|
+
WorkspaceRegistry();
|
54
|
+
~WorkspaceRegistry();
|
55
|
+
|
56
|
+
const std::unordered_map<std::type_index, std::vector<std::string>>
|
57
|
+
&WorkspaceNames() const {
|
58
|
+
return workspace_names_;
|
59
|
+
}
|
60
|
+
|
61
|
+
// Returns a string describing the registered workspaces.
|
62
|
+
string DebugString() const;
|
63
|
+
|
64
|
+
private:
|
65
|
+
// Workspace type names, indexed as workspace_types_[typeid].
|
66
|
+
std::unordered_map<std::type_index, string> workspace_types_;
|
67
|
+
|
68
|
+
// Workspace names, indexed as workspace_names_[typeid][workspace].
|
69
|
+
std::unordered_map<std::type_index, std::vector<string>> workspace_names_;
|
70
|
+
|
71
|
+
CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
|
72
|
+
};
|
73
|
+
|
74
|
+
// A typed collected of workspaces. The workspaces are indexed according to an
|
75
|
+
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
|
76
|
+
// also immutable.
|
77
|
+
class WorkspaceSet {
|
78
|
+
public:
|
79
|
+
WorkspaceSet();
|
80
|
+
~WorkspaceSet();
|
81
|
+
|
82
|
+
void Reset(const WorkspaceRegistry ®istry) {
|
83
|
+
// Deallocate current workspaces.
|
84
|
+
for (auto &it : workspaces_) {
|
85
|
+
for (size_t index = 0; index < it.second.size(); ++index) {
|
86
|
+
delete it.second[index];
|
87
|
+
}
|
88
|
+
}
|
89
|
+
workspaces_.clear();
|
90
|
+
|
91
|
+
// Allocate space for new workspaces.
|
92
|
+
for (auto &it : registry.WorkspaceNames()) {
|
93
|
+
workspaces_[it.first].resize(it.second.size());
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
private:
|
98
|
+
// The set of workspaces, indexed as workspaces_[typeid][index].
|
99
|
+
std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
|
100
|
+
};
|
101
|
+
|
102
|
+
// A workspace that wraps around a single int.
|
103
|
+
class SingletonIntWorkspace : public Workspace {
|
104
|
+
public:
|
105
|
+
// Default-initializes the int value.
|
106
|
+
SingletonIntWorkspace() {}
|
107
|
+
|
108
|
+
// Initializes the int with the given value.
|
109
|
+
explicit SingletonIntWorkspace(int value) : value_(value) {}
|
110
|
+
|
111
|
+
// Returns the name of this type of workspace.
|
112
|
+
static string TypeName() { return "SingletonInt"; }
|
113
|
+
|
114
|
+
// Returns the int value.
|
115
|
+
int get() const { return value_; }
|
116
|
+
|
117
|
+
// Sets the int value.
|
118
|
+
void set(int value) { value_ = value; }
|
119
|
+
|
120
|
+
private:
|
121
|
+
// The enclosed int.
|
122
|
+
int value_ = 0;
|
123
|
+
};
|
124
|
+
|
125
|
+
// A workspace that wraps around a vector of int.
|
126
|
+
class VectorIntWorkspace : public Workspace {
|
127
|
+
public:
|
128
|
+
// Creates a vector of the given size.
|
129
|
+
explicit VectorIntWorkspace(int size);
|
130
|
+
|
131
|
+
// Creates a vector initialized with the given array.
|
132
|
+
explicit VectorIntWorkspace(const std::vector<int> &elements);
|
133
|
+
|
134
|
+
// Creates a vector of the given size, with each element initialized to the
|
135
|
+
// given value.
|
136
|
+
VectorIntWorkspace(int size, int value);
|
137
|
+
|
138
|
+
~VectorIntWorkspace() override;
|
139
|
+
|
140
|
+
// Returns the name of this type of workspace.
|
141
|
+
static string TypeName();
|
142
|
+
|
143
|
+
// Returns the i'th element.
|
144
|
+
int element(int i) const { return elements_[i]; }
|
145
|
+
|
146
|
+
// Sets the i'th element.
|
147
|
+
void set_element(int i, int value) { elements_[i] = value; }
|
148
|
+
|
149
|
+
private:
|
150
|
+
// The enclosed vector.
|
151
|
+
std::vector<int> elements_;
|
152
|
+
};
|
153
|
+
|
154
|
+
// A workspace that wraps around a vector of vector of int.
|
155
|
+
class VectorVectorIntWorkspace : public Workspace {
|
156
|
+
public:
|
157
|
+
// Creates a vector of empty vectors of the given size.
|
158
|
+
explicit VectorVectorIntWorkspace(int size);
|
159
|
+
~VectorVectorIntWorkspace() override;
|
160
|
+
|
161
|
+
// Returns the name of this type of workspace.
|
162
|
+
static string TypeName();
|
163
|
+
|
164
|
+
// Returns the i'th vector of elements.
|
165
|
+
const std::vector<int> &elements(int i) const { return elements_[i]; }
|
166
|
+
|
167
|
+
// Mutable access to the i'th vector of elements.
|
168
|
+
std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
|
169
|
+
|
170
|
+
private:
|
171
|
+
// The enclosed vector of vector of elements.
|
172
|
+
std::vector<std::vector<int>> elements_;
|
173
|
+
};
|
174
|
+
|
175
|
+
} // namespace chrome_lang_id
|
176
|
+
|
177
|
+
#endif // WORKSPACE_H_
|
data/lib/cld3.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
# File including an implementation of CLD3 module. Some documentation is
|
2
|
+
# extracted from ext/cld3/ext/src/nnet_language_identifier.h.
|
3
|
+
#
|
4
|
+
# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
|
5
|
+
# All Rights Reserved.
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8
|
+
# you may not use this file except in compliance with the License.
|
9
|
+
# You may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
# See the License for the specific language governing permissions and
|
17
|
+
# limitations under the License.
|
18
|
+
# ==============================================================================
|
19
|
+
|
20
|
+
require "ffi"
|
21
|
+
|
22
|
+
# Module providing an interface for Compact Language Detector v3 (CLD3)
|
23
|
+
module CLD3
|
24
|
+
# Class for detecting the language of a document.
class NNetLanguageIdentifier
  # Min number of bytes needed to make a prediction if the constructor is
  # called without the corresponding parameter.
  MIN_NUM_BYTES_TO_CONSIDER = 140

  # Max number of bytes needed to make a prediction if the constructor is
  # called without the corresponding parameter.
  MAX_NUM_BYTES_TO_CONSIDER = 700

  # Max number of input bytes to process.
  MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000

  # Predictions with probability greater than or equal to this threshold are
  # marked as reliable. This threshold was optimized on a set of text segments
  # extracted from wikipedia, and results in an overall precision, recall,
  # and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
  RELIABILITY_THRESHOLD = 0.7

  # Reliability threshold for the languages hr and bs.
  RELIABILITY_HR_BS_THRESHOLD = 0.5

  # Information about a predicted language.
  Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)

  # Creates an identifier backed by a native NNetLanguageIdentifier built
  # with the given byte limits. The native handle is wrapped in Pointer,
  # whose release hook calls delete_NNetLanguageIdentifier.
  def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
    @cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
  end

  # Finds the most likely language for the given text, along with additional
  # information (e.g., probability). The prediction is based on the first N
  # bytes where N is the minimum between the number of interchange valid UTF8
  # bytes and max_num_bytes_.
  #
  # Always returns a Result. Its language is nil when the native library
  # reports the unknown language ("und"); per the upstream documentation that
  # is what happens when N is less than min_num_bytes_ (not verifiable from
  # this file).
  #
  # The text is transcoded to UTF-8 first, so String#encode may raise for
  # input that cannot be converted.
  def find_language(text)
    text_utf8 = text.encode(Encoding::UTF_8)

    # Copy the bytes into native memory; the native call gets pointer + size.
    pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
    pointer.put_bytes(0, text_utf8)

    cc_result = CLD3::Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
    language = cc_result[:language_data].read_bytes(cc_result[:language_size])

    Result.new(
      language == "und" ? nil : language,
      cc_result[:probability],
      cc_result[:reliable?],
      cc_result[:proportion])
  end

  # Owning wrapper around the native identifier handle.
  class Pointer < FFI::AutoPointer
    # Frees the native identifier behind +pointer+.
    def self.release(pointer)
      CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
    end
  end

  # A bare `private` does not apply to constants, so hide the helper class
  # explicitly.
  private_constant :Pointer
end
|
81
|
+
|
82
|
+
# Do NOT use this module from outside.
#
# Raw FFI bindings to the native cld3 library.
module Unstable
  extend FFI::Library

  # The shared library is looked up under ../ext/cld3 relative to this file.
  ffi_lib File.join(
    File.expand_path(File.dirname(__FILE__)),
    "..",
    "ext",
    "cld3",
    FFI.map_library_name("cld3")
  )

  # Struct returned by value from NNetLanguageIdentifier_find_language.
  class NNetLanguageIdentifierResult < FFI::Struct
    layout(
      :language_data, :pointer,
      :language_size, :size_t,
      :probability, :float,
      :proportion, :float,
      :reliable?, :bool
    )
  end

  attach_function(:delete_NNetLanguageIdentifier, [:pointer], :void)

  attach_function(:new_NNetLanguageIdentifier, [:int, :int], :pointer)

  attach_function(
    :NNetLanguageIdentifier_find_language,
    [:pointer, :buffer_in, :size_t],
    NNetLanguageIdentifierResult.by_value
  )
end
|
99
|
+
end
|