cld3 3.2.5 → 3.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{ext/cld3/ext/LICENSE → LICENSE_CLD3} +0 -0
- data/cld3.gemspec +1 -1
- data/ext/cld3/Makefile +266 -0
- data/ext/cld3/{ext/src/base.cc → base.cc} +0 -0
- data/ext/cld3/{ext/src/base.h → base.h} +0 -0
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/{ext/src/casts.h → casts.h} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.cc → embedding_feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_feature_extractor.h → embedding_feature_extractor.h} +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/{ext/src/embedding_network.cc → embedding_network.cc} +0 -0
- data/ext/cld3/{ext/src/embedding_network.h → embedding_network.h} +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/{ext/src/embedding_network_params.h → embedding_network_params.h} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.cc → feature_extractor.cc} +0 -0
- data/ext/cld3/{ext/src/feature_extractor.h → feature_extractor.h} +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/{ext/src/feature_extractor.proto → feature_extractor.proto} +0 -0
- data/ext/cld3/{ext/src/feature_types.cc → feature_types.cc} +0 -0
- data/ext/cld3/{ext/src/feature_types.h → feature_types.h} +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.cc → fixunicodevalue.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/fixunicodevalue.h → fixunicodevalue.h} +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/{ext/src/float16.h → float16.h} +0 -0
- data/ext/cld3/{ext/src/fml_parser.cc → fml_parser.cc} +0 -0
- data/ext/cld3/{ext/src/fml_parser.h → fml_parser.h} +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_entities.cc → generated_entities.cc} +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.cc → generated_ulscript.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/generated_ulscript.h → generated_ulscript.h} +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.cc → getonescriptspan.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/getonescriptspan.h → getonescriptspan.h} +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/{ext/src/script_span/integral_types.h → integral_types.h} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.cc → lang_id_nn_params.cc} +0 -0
- data/ext/cld3/{ext/src/lang_id_nn_params.h → lang_id_nn_params.h} +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.cc → language_identifier_features.cc} +0 -0
- data/ext/cld3/{ext/src/language_identifier_features.h → language_identifier_features.h} +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/mkmf.log +36 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.cc → nnet_language_identifier.cc} +0 -0
- data/ext/cld3/{ext/src/nnet_language_identifier.h → nnet_language_identifier.h} +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.cc → offsetmap.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/offsetmap.h → offsetmap.h} +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/{ext/src/script_span/port.h → port.h} +0 -0
- data/ext/cld3/{ext/src/registry.cc → registry.cc} +0 -0
- data/ext/cld3/{ext/src/registry.h → registry.h} +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.cc → relevant_script_feature.cc} +0 -0
- data/ext/cld3/{ext/src/relevant_script_feature.h → relevant_script_feature.h} +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/{ext/src/script_detector.h → script_detector.h} +0 -0
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/{ext/src/sentence.proto → sentence.proto} +0 -0
- data/ext/cld3/{ext/src/sentence_features.cc → sentence_features.cc} +0 -0
- data/ext/cld3/{ext/src/sentence_features.h → sentence_features.h} +0 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/{ext/src/simple_adder.h → simple_adder.h} +0 -0
- data/ext/cld3/{ext/src/script_span/stringpiece.h → stringpiece.h} +0 -0
- data/ext/cld3/{ext/src/task_context.cc → task_context.cc} +0 -0
- data/ext/cld3/{ext/src/task_context.h → task_context.h} +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/{ext/src/task_context_params.cc → task_context_params.cc} +0 -0
- data/ext/cld3/{ext/src/task_context_params.h → task_context_params.h} +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/{ext/src/task_spec.proto → task_spec.proto} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.cc → text_processing.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/text_processing.h → text_processing.h} +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/{ext/src/unicodetext.cc → unicodetext.cc} +0 -0
- data/ext/cld3/{ext/src/unicodetext.h → unicodetext.h} +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/{ext/src/script_span/utf8acceptinterchange.h → utf8acceptinterchange.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8repl_lettermarklower.h → utf8repl_lettermarklower.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8scannot_lettermarkspecial.h → utf8scannot_lettermarkspecial.h} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.cc → utf8statetable.cc} +0 -0
- data/ext/cld3/{ext/src/script_span/utf8statetable.h → utf8statetable.h} +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/{ext/src/utils.cc → utils.cc} +0 -0
- data/ext/cld3/{ext/src/utils.h → utils.h} +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/{ext/src/workspace.cc → workspace.cc} +0 -0
- data/ext/cld3/{ext/src/workspace.h → workspace.h} +0 -0
- data/ext/cld3/workspace.o +0 -0
- metadata +96 -81
- data/ext/cld3/ext/CMakeLists.txt +0 -69
- data/ext/cld3/ext/CONTRIBUTING.md +0 -26
- data/ext/cld3/ext/README.md +0 -73
- data/ext/cld3/ext/misc/myprotobuf.cmake +0 -58
- data/ext/cld3/ext/model.png +0 -0
- data/ext/cld3/ext/src/BUILD.gn +0 -133
- data/ext/cld3/ext/src/DEPS +0 -4
- data/ext/cld3/ext/src/language_identifier_features_test.cc +0 -261
- data/ext/cld3/ext/src/language_identifier_main.cc +0 -54
- data/ext/cld3/ext/src/nnet_lang_id_test.cc +0 -254
- data/ext/cld3/ext/src/nnet_lang_id_test_data.cc +0 -529
- data/ext/cld3/ext/src/nnet_lang_id_test_data.h +0 -117
- data/ext/cld3/ext/src/relevant_script_feature_test.cc +0 -259
- data/ext/cld3/ext/src/script_detector_test.cc +0 -161
- data/ext/cld3/ext/src/script_span/README.md +0 -11
- data/ext/cld3/ext/src/script_span/getonescriptspan_test.cc +0 -135
File without changes
|
File without changes
|
File without changes
|
Binary file
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
Binary file
|
File without changes
|
File without changes
|
Binary file
|
Binary file
|
File without changes
|
File without changes
|
File without changes
|
Binary file
|
File without changes
|
File without changes
|
Binary file
|
File without changes
|
data/ext/cld3/{ext/src/script_span/utf8prop_lettermarkscriptnum.h → utf8prop_lettermarkscriptnum.h}
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
Binary file
|
File without changes
|
File without changes
|
data/ext/cld3/utils.o
ADDED
Binary file
|
File without changes
|
File without changes
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cld3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.2.5
|
4
|
+
version: 3.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Akihiko Odaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ffi
|
@@ -60,89 +60,104 @@ extra_rdoc_files: []
|
|
60
60
|
files:
|
61
61
|
- Gemfile
|
62
62
|
- LICENSE
|
63
|
+
- LICENSE_CLD3
|
63
64
|
- README.md
|
64
65
|
- cld3.gemspec
|
65
|
-
- ext/cld3/ext/CMakeLists.txt
|
66
|
-
- ext/cld3/ext/CONTRIBUTING.md
|
67
|
-
- ext/cld3/ext/LICENSE
|
68
|
-
- ext/cld3/ext/README.md
|
69
|
-
- ext/cld3/ext/misc/myprotobuf.cmake
|
70
|
-
- ext/cld3/ext/model.png
|
71
|
-
- ext/cld3/ext/src/BUILD.gn
|
72
|
-
- ext/cld3/ext/src/DEPS
|
73
|
-
- ext/cld3/ext/src/base.cc
|
74
|
-
- ext/cld3/ext/src/base.h
|
75
|
-
- ext/cld3/ext/src/casts.h
|
76
|
-
- ext/cld3/ext/src/embedding_feature_extractor.cc
|
77
|
-
- ext/cld3/ext/src/embedding_feature_extractor.h
|
78
|
-
- ext/cld3/ext/src/embedding_network.cc
|
79
|
-
- ext/cld3/ext/src/embedding_network.h
|
80
|
-
- ext/cld3/ext/src/embedding_network_params.h
|
81
|
-
- ext/cld3/ext/src/feature_extractor.cc
|
82
|
-
- ext/cld3/ext/src/feature_extractor.h
|
83
|
-
- ext/cld3/ext/src/feature_extractor.proto
|
84
|
-
- ext/cld3/ext/src/feature_types.cc
|
85
|
-
- ext/cld3/ext/src/feature_types.h
|
86
|
-
- ext/cld3/ext/src/float16.h
|
87
|
-
- ext/cld3/ext/src/fml_parser.cc
|
88
|
-
- ext/cld3/ext/src/fml_parser.h
|
89
|
-
- ext/cld3/ext/src/lang_id_nn_params.cc
|
90
|
-
- ext/cld3/ext/src/lang_id_nn_params.h
|
91
|
-
- ext/cld3/ext/src/language_identifier_features.cc
|
92
|
-
- ext/cld3/ext/src/language_identifier_features.h
|
93
|
-
- ext/cld3/ext/src/language_identifier_features_test.cc
|
94
|
-
- ext/cld3/ext/src/language_identifier_main.cc
|
95
|
-
- ext/cld3/ext/src/nnet_lang_id_test.cc
|
96
|
-
- ext/cld3/ext/src/nnet_lang_id_test_data.cc
|
97
|
-
- ext/cld3/ext/src/nnet_lang_id_test_data.h
|
98
|
-
- ext/cld3/ext/src/nnet_language_identifier.cc
|
99
|
-
- ext/cld3/ext/src/nnet_language_identifier.h
|
100
|
-
- ext/cld3/ext/src/registry.cc
|
101
|
-
- ext/cld3/ext/src/registry.h
|
102
|
-
- ext/cld3/ext/src/relevant_script_feature.cc
|
103
|
-
- ext/cld3/ext/src/relevant_script_feature.h
|
104
|
-
- ext/cld3/ext/src/relevant_script_feature_test.cc
|
105
|
-
- ext/cld3/ext/src/script_detector.h
|
106
|
-
- ext/cld3/ext/src/script_detector_test.cc
|
107
|
-
- ext/cld3/ext/src/script_span/README.md
|
108
|
-
- ext/cld3/ext/src/script_span/fixunicodevalue.cc
|
109
|
-
- ext/cld3/ext/src/script_span/fixunicodevalue.h
|
110
|
-
- ext/cld3/ext/src/script_span/generated_entities.cc
|
111
|
-
- ext/cld3/ext/src/script_span/generated_ulscript.cc
|
112
|
-
- ext/cld3/ext/src/script_span/generated_ulscript.h
|
113
|
-
- ext/cld3/ext/src/script_span/getonescriptspan.cc
|
114
|
-
- ext/cld3/ext/src/script_span/getonescriptspan.h
|
115
|
-
- ext/cld3/ext/src/script_span/getonescriptspan_test.cc
|
116
|
-
- ext/cld3/ext/src/script_span/integral_types.h
|
117
|
-
- ext/cld3/ext/src/script_span/offsetmap.cc
|
118
|
-
- ext/cld3/ext/src/script_span/offsetmap.h
|
119
|
-
- ext/cld3/ext/src/script_span/port.h
|
120
|
-
- ext/cld3/ext/src/script_span/stringpiece.h
|
121
|
-
- ext/cld3/ext/src/script_span/text_processing.cc
|
122
|
-
- ext/cld3/ext/src/script_span/text_processing.h
|
123
|
-
- ext/cld3/ext/src/script_span/utf8acceptinterchange.h
|
124
|
-
- ext/cld3/ext/src/script_span/utf8prop_lettermarkscriptnum.h
|
125
|
-
- ext/cld3/ext/src/script_span/utf8repl_lettermarklower.h
|
126
|
-
- ext/cld3/ext/src/script_span/utf8scannot_lettermarkspecial.h
|
127
|
-
- ext/cld3/ext/src/script_span/utf8statetable.cc
|
128
|
-
- ext/cld3/ext/src/script_span/utf8statetable.h
|
129
|
-
- ext/cld3/ext/src/sentence.proto
|
130
|
-
- ext/cld3/ext/src/sentence_features.cc
|
131
|
-
- ext/cld3/ext/src/sentence_features.h
|
132
|
-
- ext/cld3/ext/src/simple_adder.h
|
133
|
-
- ext/cld3/ext/src/task_context.cc
|
134
|
-
- ext/cld3/ext/src/task_context.h
|
135
|
-
- ext/cld3/ext/src/task_context_params.cc
|
136
|
-
- ext/cld3/ext/src/task_context_params.h
|
137
|
-
- ext/cld3/ext/src/task_spec.proto
|
138
|
-
- ext/cld3/ext/src/unicodetext.cc
|
139
|
-
- ext/cld3/ext/src/unicodetext.h
|
140
|
-
- ext/cld3/ext/src/utils.cc
|
141
|
-
- ext/cld3/ext/src/utils.h
|
142
|
-
- ext/cld3/ext/src/workspace.cc
|
143
|
-
- ext/cld3/ext/src/workspace.h
|
66
|
+
- ext/cld3/Makefile
|
67
|
+
- ext/cld3/base.cc
|
68
|
+
- ext/cld3/base.h
|
69
|
+
- ext/cld3/base.o
|
70
|
+
- ext/cld3/casts.h
|
71
|
+
- ext/cld3/embedding_feature_extractor.cc
|
72
|
+
- ext/cld3/embedding_feature_extractor.h
|
73
|
+
- ext/cld3/embedding_feature_extractor.o
|
74
|
+
- ext/cld3/embedding_network.cc
|
75
|
+
- ext/cld3/embedding_network.h
|
76
|
+
- ext/cld3/embedding_network.o
|
77
|
+
- ext/cld3/embedding_network_params.h
|
144
78
|
- ext/cld3/extconf.rb
|
79
|
+
- ext/cld3/feature_extractor.cc
|
80
|
+
- ext/cld3/feature_extractor.h
|
81
|
+
- ext/cld3/feature_extractor.o
|
82
|
+
- ext/cld3/feature_extractor.pb.o
|
83
|
+
- ext/cld3/feature_extractor.proto
|
84
|
+
- ext/cld3/feature_types.cc
|
85
|
+
- ext/cld3/feature_types.h
|
86
|
+
- ext/cld3/feature_types.o
|
87
|
+
- ext/cld3/fixunicodevalue.cc
|
88
|
+
- ext/cld3/fixunicodevalue.h
|
89
|
+
- ext/cld3/fixunicodevalue.o
|
90
|
+
- ext/cld3/float16.h
|
91
|
+
- ext/cld3/fml_parser.cc
|
92
|
+
- ext/cld3/fml_parser.h
|
93
|
+
- ext/cld3/fml_parser.o
|
94
|
+
- ext/cld3/generated_entities.cc
|
95
|
+
- ext/cld3/generated_entities.o
|
96
|
+
- ext/cld3/generated_ulscript.cc
|
97
|
+
- ext/cld3/generated_ulscript.h
|
98
|
+
- ext/cld3/generated_ulscript.o
|
99
|
+
- ext/cld3/getonescriptspan.cc
|
100
|
+
- ext/cld3/getonescriptspan.h
|
101
|
+
- ext/cld3/getonescriptspan.o
|
102
|
+
- ext/cld3/integral_types.h
|
103
|
+
- ext/cld3/lang_id_nn_params.cc
|
104
|
+
- ext/cld3/lang_id_nn_params.h
|
105
|
+
- ext/cld3/lang_id_nn_params.o
|
106
|
+
- ext/cld3/language_identifier_features.cc
|
107
|
+
- ext/cld3/language_identifier_features.h
|
108
|
+
- ext/cld3/language_identifier_features.o
|
109
|
+
- ext/cld3/libcld3.so
|
110
|
+
- ext/cld3/mkmf.log
|
111
|
+
- ext/cld3/nnet_language_identifier.cc
|
112
|
+
- ext/cld3/nnet_language_identifier.h
|
113
|
+
- ext/cld3/nnet_language_identifier.o
|
145
114
|
- ext/cld3/nnet_language_identifier_c.cc
|
115
|
+
- ext/cld3/nnet_language_identifier_c.o
|
116
|
+
- ext/cld3/offsetmap.cc
|
117
|
+
- ext/cld3/offsetmap.h
|
118
|
+
- ext/cld3/offsetmap.o
|
119
|
+
- ext/cld3/port.h
|
120
|
+
- ext/cld3/registry.cc
|
121
|
+
- ext/cld3/registry.h
|
122
|
+
- ext/cld3/registry.o
|
123
|
+
- ext/cld3/relevant_script_feature.cc
|
124
|
+
- ext/cld3/relevant_script_feature.h
|
125
|
+
- ext/cld3/relevant_script_feature.o
|
126
|
+
- ext/cld3/script_detector.h
|
127
|
+
- ext/cld3/sentence.pb.o
|
128
|
+
- ext/cld3/sentence.proto
|
129
|
+
- ext/cld3/sentence_features.cc
|
130
|
+
- ext/cld3/sentence_features.h
|
131
|
+
- ext/cld3/sentence_features.o
|
132
|
+
- ext/cld3/simple_adder.h
|
133
|
+
- ext/cld3/stringpiece.h
|
134
|
+
- ext/cld3/task_context.cc
|
135
|
+
- ext/cld3/task_context.h
|
136
|
+
- ext/cld3/task_context.o
|
137
|
+
- ext/cld3/task_context_params.cc
|
138
|
+
- ext/cld3/task_context_params.h
|
139
|
+
- ext/cld3/task_context_params.o
|
140
|
+
- ext/cld3/task_spec.pb.o
|
141
|
+
- ext/cld3/task_spec.proto
|
142
|
+
- ext/cld3/text_processing.cc
|
143
|
+
- ext/cld3/text_processing.h
|
144
|
+
- ext/cld3/text_processing.o
|
145
|
+
- ext/cld3/unicodetext.cc
|
146
|
+
- ext/cld3/unicodetext.h
|
147
|
+
- ext/cld3/unicodetext.o
|
148
|
+
- ext/cld3/utf8acceptinterchange.h
|
149
|
+
- ext/cld3/utf8prop_lettermarkscriptnum.h
|
150
|
+
- ext/cld3/utf8repl_lettermarklower.h
|
151
|
+
- ext/cld3/utf8scannot_lettermarkspecial.h
|
152
|
+
- ext/cld3/utf8statetable.cc
|
153
|
+
- ext/cld3/utf8statetable.h
|
154
|
+
- ext/cld3/utf8statetable.o
|
155
|
+
- ext/cld3/utils.cc
|
156
|
+
- ext/cld3/utils.h
|
157
|
+
- ext/cld3/utils.o
|
158
|
+
- ext/cld3/workspace.cc
|
159
|
+
- ext/cld3/workspace.h
|
160
|
+
- ext/cld3/workspace.o
|
146
161
|
- lib/cld3.rb
|
147
162
|
homepage: https://github.com/akihikodaki/cld3-ruby
|
148
163
|
licenses:
|
data/ext/cld3/ext/CMakeLists.txt
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# This cmake scripts only builds a static cld3 lib and the unittests.
|
2
|
-
|
3
|
-
project(cld3)
|
4
|
-
|
5
|
-
# Old versions of cmake dont search/find protobuf lite
|
6
|
-
cmake_minimum_required(VERSION 3.9)
|
7
|
-
|
8
|
-
find_package(Protobuf REQUIRED)
|
9
|
-
message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}")
|
10
|
-
message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}")
|
11
|
-
message(WARNING "Protobuf 2.5 and CLD3 seems happy together. This script does NOT check if your verison of protobuf is compatible.")
|
12
|
-
message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}")
|
13
|
-
message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so
|
14
|
-
|
15
|
-
# By default, protobuf_generate_cpp generates pb.* files directy in the cmake build dir.
|
16
|
-
# But CLD3 sources have been coded using hard coded pathes to cld_3/protos/*.pb.h.
|
17
|
-
# So *.pb.h must be output to cld_3/protos.
|
18
|
-
# For that, let's use a custom my_protobuf_generate_cpp:
|
19
|
-
include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake)
|
20
|
-
my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto)
|
21
|
-
message(STATUS "PROTO_HDRS= ${PROTO_HDRS}")
|
22
|
-
|
23
|
-
add_definitions(-fPIC) # Position Independant Code
|
24
|
-
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
|
25
|
-
add_definitions(-std=c++11) # Needed for std::to_string(), ...
|
26
|
-
|
27
|
-
include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers
|
28
|
-
|
29
|
-
add_library(${PROJECT_NAME}
|
30
|
-
${PROTO_SRCS} ${PROTO_HDRS}
|
31
|
-
src/base.cc
|
32
|
-
src/embedding_feature_extractor.cc
|
33
|
-
src/embedding_network.cc
|
34
|
-
src/feature_extractor.cc
|
35
|
-
src/feature_extractor.h
|
36
|
-
src/feature_types.cc
|
37
|
-
src/fml_parser.cc
|
38
|
-
src/language_identifier_features.cc
|
39
|
-
src/lang_id_nn_params.cc
|
40
|
-
src/nnet_language_identifier.cc
|
41
|
-
src/registry.cc
|
42
|
-
src/relevant_script_feature.cc
|
43
|
-
src/sentence_features.cc
|
44
|
-
src/task_context.cc
|
45
|
-
src/task_context_params.cc
|
46
|
-
src/unicodetext.cc
|
47
|
-
src/utils.cc
|
48
|
-
src/workspace.cc
|
49
|
-
|
50
|
-
src/script_span/generated_entities.cc
|
51
|
-
src/script_span/getonescriptspan.cc
|
52
|
-
src/script_span/getonescriptspan.h
|
53
|
-
src/script_span/getonescriptspan_test.cc
|
54
|
-
src/script_span/utf8statetable.cc
|
55
|
-
src/script_span/offsetmap.cc
|
56
|
-
src/script_span/text_processing.cc
|
57
|
-
src/script_span/text_processing.h
|
58
|
-
src/script_span/fixunicodevalue.cc
|
59
|
-
)
|
60
|
-
|
61
|
-
# unit tests exec:
|
62
|
-
add_executable(language_identifier_main src/language_identifier_main.cc)
|
63
|
-
target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES})
|
64
|
-
|
65
|
-
add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc)
|
66
|
-
target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES})
|
67
|
-
|
68
|
-
add_executable(language_identifier_features_test src/language_identifier_features_test.cc)
|
69
|
-
target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES})
|
data/ext/cld3/ext/CONTRIBUTING.md
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
Want to contribute? Great! First, read this page (including the small print at
|
2
|
-
the end).
|
3
|
-
|
4
|
-
### Before you contribute
|
5
|
-
Before we can use your code, you must sign the
|
6
|
-
[Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual)
|
7
|
-
(CLA), which you can do online. The CLA is necessary mainly because you own the
|
8
|
-
copyright to your changes, even after your contribution becomes part of our
|
9
|
-
codebase, so we need your permission to use and distribute your code. We also
|
10
|
-
need to be sure of various other things—for instance that you'll tell us if you
|
11
|
-
know that your code infringes on other people's patents. You don't have to sign
|
12
|
-
the CLA until after you've submitted your code for review and a member has
|
13
|
-
approved it, but you must do it before we can put your code into our codebase.
|
14
|
-
Before you start working on a larger contribution, you should get in touch with
|
15
|
-
us first through the issue tracker with your idea so that we can help out and
|
16
|
-
possibly guide you. Coordinating up front makes it much easier to avoid
|
17
|
-
frustration later on.
|
18
|
-
|
19
|
-
### Code reviews
|
20
|
-
All submissions, including submissions by project members, require review. We
|
21
|
-
use Github pull requests for this purpose.
|
22
|
-
|
23
|
-
### The small print
|
24
|
-
Contributions made by corporations are covered by a different agreement than
|
25
|
-
the one above, the
|
26
|
-
[Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate).
|
data/ext/cld3/ext/README.md
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
# Compact Language Detector v3 (CLD3)
|
2
|
-
|
3
|
-
* [Model](#model)
|
4
|
-
* [Installation](#installation)
|
5
|
-
* [Contact](#contact)
|
6
|
-
* [Credits](#credits)
|
7
|
-
|
8
|
-
### Model
|
9
|
-
|
10
|
-
CLD3 is a neural network model for language identification. This package
|
11
|
-
contains the inference code and a trained model. The inference code
|
12
|
-
extracts character ngrams from the input text and computes the fraction
|
13
|
-
of times each of them appears. For example, as shown in the figure below,
|
14
|
-
if the input text is "banana", then one of the extracted trigrams is "ana"
|
15
|
-
and the corresponding fraction is 2/4. The ngrams are hashed down to an id
|
16
|
-
within a small range, and each id is represented by a dense embedding vector
|
17
|
-
estimated during training.
|
18
|
-
|
19
|
-
The model averages the embeddings corresponding to each ngram type according
|
20
|
-
to the fractions, and the averaged embeddings are concatenated to produce
|
21
|
-
the embedding layer. The remaining components of the network are a hidden
|
22
|
-
(Rectified linear) layer and a softmax layer.
|
23
|
-
|
24
|
-
To get a language prediction for the input text, we simply perform a forward
|
25
|
-
pass through the network.
|
26
|
-
|
27
|
-
![Figure](model.png "CLD3")
|
28
|
-
|
29
|
-
### Installation
|
30
|
-
CLD3 is designed to run in the Chrome browser, so it relies on code in
|
31
|
-
[Chromium](http://www.chromium.org/).
|
32
|
-
The steps for building and running the demo of the language detection model are:
|
33
|
-
|
34
|
-
- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the
|
35
|
-
Chromium repository.
|
36
|
-
- copy the code to `//third_party/cld_3`
|
37
|
-
- Uncomment `language_identifier_main` executable in `src/BUILD.gn`.
|
38
|
-
- build and run the model using the commands:
|
39
|
-
|
40
|
-
```shell
|
41
|
-
gn gen out/Default
|
42
|
-
ninja -C out/Default third_party/cld_3/src/src:language_identifier_main
|
43
|
-
out/Default/language_identifier_main
|
44
|
-
```
|
45
|
-
### Bugs and Feature Requests
|
46
|
-
|
47
|
-
Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests.
|
48
|
-
|
49
|
-
### Announcements and Discussion
|
50
|
-
|
51
|
-
For announcements regarding major updates as well as general discussion list, please subscribe to:
|
52
|
-
[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users)
|
53
|
-
|
54
|
-
### Credits
|
55
|
-
|
56
|
-
Original authors of the code in this package include (in alphabetical order):
|
57
|
-
|
58
|
-
* Alex Salcianu
|
59
|
-
* Andy Golding
|
60
|
-
* Anton Bakalov
|
61
|
-
* Chris Alberti
|
62
|
-
* Daniel Andor
|
63
|
-
* David Weiss
|
64
|
-
* Emily Pitler
|
65
|
-
* Greg Coppola
|
66
|
-
* Jason Riesa
|
67
|
-
* Kuzman Ganchev
|
68
|
-
* Michael Ringgaard
|
69
|
-
* Nan Hua
|
70
|
-
* Ryan McDonald
|
71
|
-
* Slav Petrov
|
72
|
-
* Stefan Istrate
|
73
|
-
* Terry Koo
|
data/ext/cld3/ext/misc/myprotobuf.cmake
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
# Special PROTOBUF_GENERATE_CPP which allows to set the output folder:
|
2
|
-
# From https://stackoverflow.com/users/1600278/akira-okumura
|
3
|
-
|
4
|
-
function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS)
|
5
|
-
if(NOT ARGN)
|
6
|
-
message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files")
|
7
|
-
return()
|
8
|
-
endif()
|
9
|
-
|
10
|
-
if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
|
11
|
-
# Create an include path for each file specified
|
12
|
-
foreach(FIL ${ARGN})
|
13
|
-
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
|
14
|
-
get_filename_component(ABS_PATH ${ABS_FIL} PATH)
|
15
|
-
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
|
16
|
-
if(${_contains_already} EQUAL -1)
|
17
|
-
list(APPEND _protobuf_include_path -I ${ABS_PATH})
|
18
|
-
endif()
|
19
|
-
endforeach()
|
20
|
-
else()
|
21
|
-
set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
|
22
|
-
endif()
|
23
|
-
|
24
|
-
if(DEFINED PROTOBUF_IMPORT_DIRS)
|
25
|
-
foreach(DIR ${PROTOBUF_IMPORT_DIRS})
|
26
|
-
get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
|
27
|
-
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
|
28
|
-
if(${_contains_already} EQUAL -1)
|
29
|
-
list(APPEND _protobuf_include_path -I ${ABS_PATH})
|
30
|
-
endif()
|
31
|
-
endforeach()
|
32
|
-
endif()
|
33
|
-
|
34
|
-
set(${SRCS})
|
35
|
-
set(${HDRS})
|
36
|
-
foreach(FIL ${ARGN})
|
37
|
-
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
|
38
|
-
get_filename_component(FIL_WE ${FIL} NAME_WE)
|
39
|
-
|
40
|
-
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc")
|
41
|
-
list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h")
|
42
|
-
|
43
|
-
execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${PATH})
|
44
|
-
|
45
|
-
add_custom_command(
|
46
|
-
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc"
|
47
|
-
"${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h"
|
48
|
-
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
|
49
|
-
ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PATH} ${_protobuf_include_path} ${ABS_FIL}
|
50
|
-
DEPENDS ${ABS_FIL}
|
51
|
-
COMMENT "Running C++ protocol buffer compiler on ${FIL}"
|
52
|
-
VERBATIM )
|
53
|
-
endforeach()
|
54
|
-
|
55
|
-
set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
|
56
|
-
set(${SRCS} ${${SRCS}} PARENT_SCOPE)
|
57
|
-
set(${HDRS} ${${HDRS}} PARENT_SCOPE)
|
58
|
-
endfunction()
|