cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,35 @@
1
+ # Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
2
+ # All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #==============================================================================
16
+
17
# Gem specification for cld3: package metadata, supported Ruby and dependency
# version ranges, the files to package, and the extension build script.
Gem::Specification.new do |spec|
  # Identity and descriptive metadata.
  spec.name = "cld3"
  spec.version = "3.1.0"
  spec.summary = "Compact Language Detector v3 (CLD3)"
  spec.description = "Compact Language Detector v3 (CLD3) is a neural network model for language identification."
  spec.license = "Apache-2.0"
  spec.homepage = "https://github.com/akihikodaki/cld3-ruby"
  spec.author = "Akihiko Odaki"
  spec.email = "akihiko.odaki.4i@stu.hosei.ac.jp"

  # Interpreter range and dependency version windows.
  spec.required_ruby_version = [">= 2.3.0", "< 2.5.0"]
  spec.add_dependency("ffi", [">= 1.1.0", "< 1.10.0"])
  spec.add_development_dependency("rspec", [">=2.11.0", "< 3.7.0"])

  # Files shipped with the gem, expressed as glob patterns relative to the
  # directory the specification is evaluated in.
  packaged_patterns = [
    "Gemfile", "LICENSE", "LICENSE_CLD3", "README.md",
    "cld3.gemspec", "ext/**/*", "lib/**/*"
  ]
  spec.files = Dir[*packaged_patterns]

  spec.require_paths = ["lib"]
  spec.extensions = ["ext/cld3/extconf.rb"]
end
@@ -0,0 +1,36 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "base.h"
17
+
18
+ #include <string>
19
+ #ifdef COMPILER_MSVC
20
+ #include <sstream>
21
+ #endif // COMPILER_MSVC
22
+
23
+ namespace chrome_lang_id {
24
+
25
+ // TODO(abakalov): Pick the most efficient approach.
26
+ #ifdef COMPILER_MSVC
27
+ std::string Int64ToString(int64 input) {
28
+ std::stringstream stream;
29
+ stream << input;
30
+ return stream.str();
31
+ }
32
+ #else
33
+ std::string Int64ToString(int64 input) { return std::to_string(input); }
34
+ #endif // COMPILER_MSVC
35
+
36
+ } // namespace chrome_lang_id
@@ -0,0 +1,106 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef BASE_H_
17
+ #define BASE_H_
18
+
19
+ #include <cassert>
20
+ #include <map>
21
+ #include <string>
22
+ #include <vector>
23
+
24
namespace chrome_lang_id {

// Standard-library names that the rest of the library uses unqualified.
using std::vector;
using std::string;
using std::map;
using std::pair;
typedef unsigned int uint32;

// CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) is placed inside the body of class
// TypeName and removes its copy constructor and copy assignment operator.
//
// The deleted-function form is used whenever the compiler is known to support
// it: when the build defines LANG_CXX11 to a non-zero value, when __cplusplus
// reports C++11 or later, or on MSVC 2013 and newer (MSVC keeps reporting an
// old __cplusplus value unless /Zc:__cplusplus is passed). Relying on
// LANG_CXX11 alone silently selected the weaker C++98 form in builds that never
// define that macro.
#if LANG_CXX11 || __cplusplus >= 201103L || \
    (defined(_MSC_VER) && _MSC_VER >= 1800)
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &) = delete;          \
  TypeName &operator=(const TypeName &) = delete
#else  // C++98 case follows

// Note that these C++98 implementations cannot completely disallow copying,
// as members and friends can still accidentally make elided copies without
// triggering a linker error.
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &);                   \
  TypeName &operator=(const TypeName &)
#endif  // LANG_CXX11

// CLD3_IMMEDIATE_CRASH() terminates the program on the spot. A definition
// supplied before this header is included takes precedence. Otherwise GCC and
// Clang use __builtin_trap(), and other compilers fall back to a volatile write
// through a null pointer.
#ifndef CLD3_IMMEDIATE_CRASH
#if defined(__GNUC__) || defined(__clang__)
#define CLD3_IMMEDIATE_CRASH() __builtin_trap()
#else
#define CLD3_IMMEDIATE_CRASH() ((void)(*(volatile char *)0 = 0))
#endif
#endif  // CLD3_IMMEDIATE_CRASH

// CLD3_CHECK(f) evaluates |f| exactly once and crashes when it is false. It is
// active in every build configuration.
#define CLD3_CHECK(f) (!(f) ? CLD3_IMMEDIATE_CRASH() : (void)0)

// CLD3_DCHECK(f) behaves like CLD3_CHECK(f), except that it expands to a no-op
// (and |f| is not evaluated) when NDEBUG is defined and DCHECK_ALWAYS_ON is
// not.
#if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON)
#define CLD3_DCHECK(f) ((void)0)
#else
#define CLD3_DCHECK(f) CLD3_CHECK(f)
#endif

// Short integer type names. Hidden from SWIG.
#ifndef SWIG
typedef int int32;
typedef unsigned char uint8;    // NOLINT
typedef unsigned short uint16;  // NOLINT

// A type to represent a Unicode code-point value. As of Unicode 4.0,
// such values require up to 21 bits.
// (For type-checking on pointers, make this explicitly signed,
// and it should always be the signed version of whatever int32 is.)
typedef signed int char32;
#endif  // SWIG

// 64-bit signed integer; MSVC builds use the compiler-specific __int64.
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64;  // NOLINT
#endif  // COMPILER_MSVC

#if defined(__GNUC__) && \
    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))

// For functions we want to force inline.
// Introduced in gcc 3.1.
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))

#elif defined(_MSC_VER)
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __forceinline
#else

// Other compilers will have to figure it out for themselves.
#define CLD3_ATTRIBUTE_ALWAYS_INLINE
#endif

// Byte-string alias.
// NOTE(review): the INTERNAL_BUILD branch names basic_string unqualified, and
// nothing in this header brings that name into scope -- presumably the internal
// build environment does; confirm before enabling INTERNAL_BUILD elsewhere.
#ifdef INTERNAL_BUILD
typedef basic_string<char> bstring;
#else
typedef std::basic_string<char> bstring;
#endif  // INTERNAL_BUILD

// Converts int64 to string.
std::string Int64ToString(int64 input);

}  // namespace chrome_lang_id
105
+
106
+ #endif // BASE_H_
@@ -0,0 +1,98 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ // This code is compiled directly on many platforms, including client
17
+ // platforms like Windows, Mac, and embedded systems. Before making
18
+ // any changes here, make sure that you're not breaking any platforms.
19
+ //
20
+
21
+ #ifndef CASTS_H_
22
+ #define CASTS_H_
23
+
24
+ #include <string.h> // for memcpy
25
+
26
+ namespace chrome_lang_id {
27
+
28
+ // lang_id_bit_cast<Dest,Source> is a template function that implements the
29
+ // equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
30
+ // very low-level functions like the protobuf library and fast math
31
+ // support.
32
+ //
33
+ // float f = 3.14159265358979;
34
+ // int i = lang_id_bit_cast<int32>(f);
35
+ // // i = 0x40490fdb
36
+ //
37
+ // The classical address-casting method is:
38
+ //
39
+ // // WRONG
40
+ // float f = 3.14159265358979; // WRONG
41
+ // int i = * reinterpret_cast<int*>(&f); // WRONG
42
+ //
43
+ // The address-casting method actually produces undefined behavior
44
+ // according to ISO C++ specification section 3.10, paragraph 15. Roughly, this
45
+ // section says: if an object in memory has one type, and a program
46
+ // accesses it with a different type, then the result is undefined
47
+ // behavior for most values of "different type".
48
+ //
49
+ // This is true for any cast syntax, either *(int*)&f or
50
+ // *reinterpret_cast<int*>(&f). And it is particularly true for
51
+ // conversions between integral lvalues and floating-point lvalues.
52
+ //
53
+ // The purpose of 3.10, paragraph 15 is to allow optimizing compilers to assume
54
+ // that expressions with different types refer to different memory. gcc
55
+ // 4.0.1 has an optimizer that takes advantage of this. So a
56
+ // non-conforming program quietly produces wildly incorrect output.
57
+ //
58
+ // The problem is not the use of reinterpret_cast. The problem is type
59
+ // punning: holding an object in memory of one type and reading its bits
60
+ // back using a different type.
61
+ //
62
+ // The C++ standard is more subtle and complex than this, but that
63
+ // is the basic idea.
64
+ //
65
+ // Anyways ...
66
+ //
67
+ // lang_id_bit_cast<> calls memcpy() which is blessed by the standard,
68
+ // especially by the example in section 3.9. Also, of course,
69
+ // lang_id_bit_cast<> wraps up the nasty logic in one place.
70
+ //
71
+ // Fortunately memcpy() is very fast. In optimized mode, with a
72
+ // constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
73
+ // code with the minimal amount of data movement. On a 32-bit system,
74
+ // memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
75
+ // compiles to two loads and two stores.
76
+ //
77
+ // I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
78
+ //
79
+ // WARNING: if Dest or Source is a non-POD type, the result of the memcpy
80
+ // is likely to surprise you.
81
+ //
82
+ // Props to Bill Gibbons for the compile time assertion technique and
83
+ // Art Komninos and Igor Tandetnik for the msvc experiments.
84
+ //
85
+ // -- mec 2005-10-17
86
+
87
// Returns a Dest whose bytes are a copy of the bytes of |source|. Dest and
// Source must be the same size, which is checked at compile time.
template <class Dest, class Source>
inline Dest lang_id_bit_cast(const Source &source) {
  static_assert(sizeof(Source) == sizeof(Dest), "Sizes do not match");

  // Copy through memcpy rather than casting the address, so no object is read
  // through an lvalue of a different type.
  Dest result;
  memcpy(&result, &source, sizeof(Dest));
  return result;
}
95
+
96
+ } // namespace chrome_lang_id
97
+
98
+ #endif // CASTS_H_
@@ -0,0 +1,51 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #include "embedding_feature_extractor.h"
17
+
18
+ #include <stddef.h>
19
+ #include <vector>
20
+
21
+ #include "feature_extractor.h"
22
+ #include "feature_types.h"
23
+ #include "task_context.h"
24
+ #include "utils.h"
25
+
26
+ namespace chrome_lang_id {
27
+
28
+ GenericEmbeddingFeatureExtractor::GenericEmbeddingFeatureExtractor() {}
29
+
30
+ GenericEmbeddingFeatureExtractor::~GenericEmbeddingFeatureExtractor() {}
31
+
32
+ void GenericEmbeddingFeatureExtractor::Setup(TaskContext *context) {
33
+ // Don't use version to determine how to get feature FML.
34
+ string features_param = ArgPrefix();
35
+ features_param += "_features";
36
+ const string features = context->Get(features_param, "");
37
+ const string embedding_names =
38
+ context->Get(GetParamName("embedding_names"), "");
39
+ const string embedding_dims =
40
+ context->Get(GetParamName("embedding_dims"), "");
41
+ embedding_fml_ = utils::Split(features, ';');
42
+ add_strings_ = context->Get(GetParamName("add_varlen_strings"), false);
43
+ embedding_names_ = utils::Split(embedding_names, ';');
44
+ for (const string &dim : utils::Split(embedding_dims, ';')) {
45
+ embedding_dims_.push_back(utils::ParseUsing<int>(dim, utils::ParseInt32));
46
+ }
47
+ }
48
+
49
+ void GenericEmbeddingFeatureExtractor::Init(TaskContext *context) {}
50
+
51
+ } // namespace chrome_lang_id
@@ -0,0 +1,182 @@
1
+ /* Copyright 2016 Google Inc. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ ==============================================================================*/
15
+
16
+ #ifndef EMBEDDING_FEATURE_EXTRACTOR_H_
17
+ #define EMBEDDING_FEATURE_EXTRACTOR_H_
18
+
19
+ #include <memory>
20
+ #include <string>
21
+ #include <vector>
22
+
23
+ #include "feature_extractor.h"
24
+ #include "task_context.h"
25
+ #include "workspace.h"
26
+
27
+ namespace chrome_lang_id {
28
+
29
+ // An EmbeddingFeatureExtractor manages the extraction of features for
30
+ // embedding-based models. It wraps a sequence of underlying classes of feature
31
+ // extractors, along with associated predicate maps. Each class of feature
32
+ // extractors is associated with a name, e.g., "unigrams", "bigrams".
33
+ //
34
+ // The class is split between a generic abstract version,
35
+ // GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
36
+ // signature of the ExtractFeatures method) and a typed version.
37
+ //
38
+ // The predicate maps must be initialized before use: they can be loaded using
39
+ // Read() or updated via UpdateMapsForExample.
40
+ class GenericEmbeddingFeatureExtractor {
41
+ public:
42
+ GenericEmbeddingFeatureExtractor();
43
+ virtual ~GenericEmbeddingFeatureExtractor();
44
+
45
+ // Get the prefix string to put in front of all arguments, so they don't
46
+ // conflict with other embedding models.
47
+ virtual const string ArgPrefix() const = 0;
48
+
49
+ // Sets up predicate maps and embedding space names that are common for all
50
+ // embedding based feature extractors.
51
+ virtual void Setup(TaskContext *context);
52
+ virtual void Init(TaskContext *context);
53
+
54
+ // Requests workspace for the underlying feature extractors. This is
55
+ // implemented in the typed class.
56
+ virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
57
+
58
+ // Number of predicates for the embedding at a given index (vocabulary size.)
59
+ int EmbeddingSize(int index) const {
60
+ return generic_feature_extractor(index).GetDomainSize();
61
+ }
62
+
63
+ // Returns number of embedding spaces.
64
+ int NumEmbeddings() const { return embedding_dims_.size(); }
65
+
66
+ // Returns the number of features in the embedding space.
67
+ int FeatureSize(int idx) const {
68
+ return generic_feature_extractor(idx).feature_types();
69
+ }
70
+
71
+ // Returns the dimensionality of the embedding space.
72
+ int EmbeddingDims(int index) const { return embedding_dims_[index]; }
73
+
74
+ // Accessor for embedding dims (dimensions of the embedding spaces).
75
+ const std::vector<int> &embedding_dims() const { return embedding_dims_; }
76
+
77
+ const std::vector<string> &embedding_fml() const { return embedding_fml_; }
78
+
79
+ // Get parameter name by concatenating the prefix and the original name.
80
+ string GetParamName(const string &param_name) const {
81
+ string name = ArgPrefix();
82
+ name += "_";
83
+ name += param_name;
84
+ return name;
85
+ }
86
+
87
+ protected:
88
+ // Provides the generic class with access to the templated extractors. This is
89
+ // used to get the type information out of the feature extractor without
90
+ // knowing the specific calling arguments of the extractor itself.
91
+ virtual const GenericFeatureExtractor &generic_feature_extractor(
92
+ int idx) const = 0;
93
+
94
+ private:
95
+ // Embedding space names for parameter sharing.
96
+ std::vector<string> embedding_names_;
97
+
98
+ // FML strings for each feature extractor.
99
+ std::vector<string> embedding_fml_;
100
+
101
+ // Size of each of the embedding spaces (maximum predicate id).
102
+ std::vector<int> embedding_sizes_;
103
+
104
+ // Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
105
+ std::vector<int> embedding_dims_;
106
+
107
+ // Whether or not to add string descriptions to converted examples.
108
+ bool add_strings_;
109
+ };
110
+
111
+ // Templated, object-specific implementation of the
112
+ // EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
113
+ // ARGS...> class that has the appropriate FeatureTraits() to ensure that
114
+ // locator type features work.
115
+ //
116
+ // Note: for backwards compatibility purposes, this always reads the FML spec
117
+ // from "<prefix>_features".
118
+ template <class EXTRACTOR, class OBJ, class... ARGS>
119
+ class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
120
+ public:
121
+ // Sets up all predicate maps, feature extractors, and flags.
122
+ void Setup(TaskContext *context) override {
123
+ GenericEmbeddingFeatureExtractor::Setup(context);
124
+ feature_extractors_.resize(embedding_fml().size());
125
+ for (size_t i = 0; i < embedding_fml().size(); ++i) {
126
+ feature_extractors_[i].Parse(embedding_fml()[i]);
127
+ feature_extractors_[i].Setup(context);
128
+ }
129
+ }
130
+
131
+ // Initializes resources needed by the feature extractors.
132
+ void Init(TaskContext *context) override {
133
+ GenericEmbeddingFeatureExtractor::Init(context);
134
+ for (auto &feature_extractor : feature_extractors_) {
135
+ feature_extractor.Init(context);
136
+ }
137
+ }
138
+
139
+ // Requests workspaces from the registry. Must be called after Init(), and
140
+ // before Preprocess().
141
+ void RequestWorkspaces(WorkspaceRegistry *registry) override {
142
+ for (auto &feature_extractor : feature_extractors_) {
143
+ feature_extractor.RequestWorkspaces(registry);
144
+ }
145
+ }
146
+
147
+ // Must be called on the object one state for each sentence, before any
148
+ // feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
149
+ void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
150
+ for (auto &feature_extractor : feature_extractors_) {
151
+ feature_extractor.Preprocess(workspaces, obj);
152
+ }
153
+ }
154
+
155
+ // Extracts features using the extractors. Note that features must already
156
+ // be initialized to the correct number of feature extractors. No predicate
157
+ // mapping is applied.
158
+ void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
159
+ ARGS... args,
160
+ std::vector<FeatureVector> *features) const {
161
+ for (size_t i = 0; i < feature_extractors_.size(); ++i) {
162
+ features->at(i).clear();
163
+ feature_extractors_.at(i).ExtractFeatures(workspaces, obj, args...,
164
+ &features->at(i));
165
+ }
166
+ }
167
+
168
+ protected:
169
+ // Provides generic access to the feature extractors.
170
+ const GenericFeatureExtractor &generic_feature_extractor(
171
+ int idx) const override {
172
+ return feature_extractors_.at(idx);
173
+ }
174
+
175
+ private:
176
+ // Templated feature extractor class.
177
+ std::vector<EXTRACTOR> feature_extractors_;
178
+ };
179
+
180
+ } // namespace chrome_lang_id
181
+
182
+ #endif // EMBEDDING_FEATURE_EXTRACTOR_H_