RubyGems - cld3 - Versions diffs - 3.1.0 - Mend

cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

checksums.yaml +7 -0
data/Gemfile +18 -0
data/LICENSE +204 -0
data/LICENSE_CLD3 +203 -0
data/README.md +22 -0
data/cld3.gemspec +35 -0
data/ext/cld3/base.cc +36 -0
data/ext/cld3/base.h +106 -0
data/ext/cld3/casts.h +98 -0
data/ext/cld3/embedding_feature_extractor.cc +51 -0
data/ext/cld3/embedding_feature_extractor.h +182 -0
data/ext/cld3/embedding_network.cc +196 -0
data/ext/cld3/embedding_network.h +186 -0
data/ext/cld3/embedding_network_params.h +285 -0
data/ext/cld3/extconf.rb +49 -0
data/ext/cld3/feature_extractor.cc +137 -0
data/ext/cld3/feature_extractor.h +633 -0
data/ext/cld3/feature_extractor.proto +50 -0
data/ext/cld3/feature_types.cc +72 -0
data/ext/cld3/feature_types.h +158 -0
data/ext/cld3/fixunicodevalue.cc +55 -0
data/ext/cld3/fixunicodevalue.h +69 -0
data/ext/cld3/float16.h +58 -0
data/ext/cld3/fml_parser.cc +308 -0
data/ext/cld3/fml_parser.h +123 -0
data/ext/cld3/generated_entities.cc +296 -0
data/ext/cld3/generated_ulscript.cc +678 -0
data/ext/cld3/generated_ulscript.h +142 -0
data/ext/cld3/getonescriptspan.cc +1109 -0
data/ext/cld3/getonescriptspan.h +124 -0
data/ext/cld3/integral_types.h +37 -0
data/ext/cld3/lang_id_nn_params.cc +57449 -0
data/ext/cld3/lang_id_nn_params.h +178 -0
data/ext/cld3/language_identifier_features.cc +165 -0
data/ext/cld3/language_identifier_features.h +116 -0
data/ext/cld3/nnet_language_identifier.cc +380 -0
data/ext/cld3/nnet_language_identifier.h +175 -0
data/ext/cld3/nnet_language_identifier_c.cc +72 -0
data/ext/cld3/offsetmap.cc +478 -0
data/ext/cld3/offsetmap.h +168 -0
data/ext/cld3/port.h +143 -0
data/ext/cld3/registry.cc +28 -0
data/ext/cld3/registry.h +242 -0
data/ext/cld3/relevant_script_feature.cc +89 -0
data/ext/cld3/relevant_script_feature.h +49 -0
data/ext/cld3/script_detector.h +156 -0
data/ext/cld3/sentence.proto +77 -0
data/ext/cld3/sentence_features.cc +29 -0
data/ext/cld3/sentence_features.h +35 -0
data/ext/cld3/simple_adder.h +72 -0
data/ext/cld3/stringpiece.h +81 -0
data/ext/cld3/task_context.cc +161 -0
data/ext/cld3/task_context.h +81 -0
data/ext/cld3/task_context_params.cc +74 -0
data/ext/cld3/task_context_params.h +54 -0
data/ext/cld3/task_spec.proto +98 -0
data/ext/cld3/text_processing.cc +245 -0
data/ext/cld3/text_processing.h +30 -0
data/ext/cld3/unicodetext.cc +96 -0
data/ext/cld3/unicodetext.h +144 -0
data/ext/cld3/utf8acceptinterchange.h +486 -0
data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
data/ext/cld3/utf8repl_lettermarklower.h +758 -0
data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
data/ext/cld3/utf8statetable.cc +1344 -0
data/ext/cld3/utf8statetable.h +285 -0
data/ext/cld3/utils.cc +241 -0
data/ext/cld3/utils.h +144 -0
data/ext/cld3/workspace.cc +64 -0
data/ext/cld3/workspace.h +177 -0
data/lib/cld3.rb +99 -0
metadata +158 -0

data/ext/cld3/relevant_script_feature.cc ADDED

@@ -0,0 +1,89 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "relevant_script_feature.h"
+#include <ctype.h>
+#include <string>
+#include "feature_extractor.h"
+#include "feature_types.h"
+#include "language_identifier_features.h"
+#include "script_detector.h"
+#include "cld_3/protos/sentence.pb.h"
+#include "sentence_features.h"
+#include "task_context.h"
+#include "utils.h"
+#include "workspace.h"
+namespace chrome_lang_id {
+// Setup step of the feature.  RelevantScriptFeature has nothing to prepare, so
+// |context| is left untouched.
+void RelevantScriptFeature::Setup(TaskContext *context) {}
+// Creates a NumericFeatureType from this feature's name() and
+// kNumRelevantScripts and hands it to set_feature_type().  |context| is not
+// consulted.
+void RelevantScriptFeature::Init(TaskContext *context) {
+  NumericFeatureType *const script_feature_type =
+      new NumericFeatureType(name(), kNumRelevantScripts);
+  set_feature_type(script_feature_type);
+}
+// Computes the share of each relevant script among the counted characters of
+// |sentence| and appends one FloatFeatureValue (script id, ratio) to |result|
+// for every script that occurs at least once.  Single-byte characters for
+// which isalpha() is false are not counted.  |workspaces| is not used.
+void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
+                                     const Sentence &sentence,
+                                     FeatureVector *result) const {
+  const string &text = sentence.text();
+  // We expect kNumRelevantScripts to be small, so we stack-allocate the array
+  // of counts.  Still, if that changes, we want to find out.
+  static_assert(
+      kNumRelevantScripts < 25,
+      "switch counts to vector<int>: too big for stack-allocated int[]");
+  // counts[s] is the number of characters with script s.
+  // Note: {} "value-initializes" the array to zero.
+  int counts[kNumRelevantScripts]{};
+  int total_count = 0;
+  const char *const text_end = text.data() + text.size();
+  const char *curr = text.data();
+  while (curr < text_end) {
+    // Compute the character length once; it is reused for the checks below
+    // and for advancing to the next character.
+    const int num_bytes = utils::OneCharLen(curr);
+    // If a partial UTF-8 character is encountered, break out of the loop.
+    // Comparing against the number of remaining bytes avoids forming a
+    // pointer beyond the end of the buffer.
+    if (num_bytes > text_end - curr) {
+      break;
+    }
+    // Skip spaces, numbers, punctuation, and all other non-alpha ASCII
+    // characters: these characters are used in so many languages, they do not
+    // communicate language-related information.
+    // The cast to unsigned char matters: passing a negative value to isalpha()
+    // is undefined behavior, and a plain char can be negative if a byte
+    // >= 0x80 is ever reported as a one-byte character (e.g. malformed input).
+    const bool is_skipped_single_byte =
+        (num_bytes == 1) && !isalpha(static_cast<unsigned char>(*curr));
+    if (!is_skipped_single_byte) {
+      Script script = GetScript(curr, num_bytes);
+      CLD3_DCHECK(script >= 0);
+      CLD3_DCHECK(script < kNumRelevantScripts);
+      counts[static_cast<int>(script)]++;
+      total_count++;
+    }
+    curr += num_bytes;
+  }
+  for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
+    int count = counts[script_id];
+    // count > 0 implies total_count > 0, so the division below is safe.
+    if (count > 0) {
+      const float weight = static_cast<float>(count) / total_count;
+      FloatFeatureValue value(script_id, weight);
+      result->add(feature_type(), value.discrete_value);
+    }
+  }
+}
+}  // namespace chrome_lang_id

data/ext/cld3/relevant_script_feature.h ADDED

@@ -0,0 +1,49 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef RELEVANT_SCRIPT_FEATURE_H_
+#define RELEVANT_SCRIPT_FEATURE_H_
+#include "feature_extractor.h"
+#include "cld_3/protos/sentence.pb.h"
+#include "sentence_features.h"
+#include "task_context.h"
+#include "workspace.h"
+namespace chrome_lang_id {
+// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
+// script (see below): each such feature indicates the script and the ratio of
+// UTF8 characters in that script, in the given sentence.
+//
+// What is a relevant script?  Recognizing all 100+ Unicode scripts would
+// require too much code size and runtime.  Instead, we focus only on a few
+// scripts that communicate a lot of language information: e.g., the use of
+// Hiragana characters almost always indicates Japanese, so Hiragana is a
+// "relevant" script for us.  The Latin script is used by dozens of languages,
+// so Latin is not relevant in this context.
+class RelevantScriptFeature : public WholeSentenceFeature {
+ public:
+  void Setup(TaskContext *context) override;  // No-op for this feature.
+  void Init(TaskContext *context) override;   // Installs the feature type.
+  // Appends one feature per relevant script found in |sentence| to |result|.
+  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
+                FeatureVector *result) const override;
+};
+}  // namespace chrome_lang_id
+#endif  // RELEVANT_SCRIPT_FEATURE_H_

data/ext/cld3/script_detector.h ADDED

@@ -0,0 +1,156 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef SCRIPT_DETECTOR_H_
+#define SCRIPT_DETECTOR_H_
+namespace chrome_lang_id {
+// Unicode scripts we care about.  To get compact and fast code, we detect only
+// a few Unicode scripts that offer a strong indication about the language of
+// the text (e.g., Hiragana -> Japanese).  The code point ranges noted below
+// are the ones GetScript() tests for.
+enum Script {
+  // Special value to indicate internal errors in the script detection code.
+  kScriptError,
+  // Special values for all Unicode scripts that we do not detect.  One special
+  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
+  // already have that information, we use it).  kScriptOtherUtf8OneByte means
+  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
+  kScriptOtherUtf8OneByte,
+  kScriptOtherUtf8TwoBytes,
+  kScriptOtherUtf8ThreeBytes,
+  kScriptOtherUtf8FourBytes,
+  kScriptGreek,       // U+0370..U+03FF.
+  kScriptCyrillic,    // U+0400..U+04FF.
+  kScriptHebrew,      // U+0590..U+05FF.
+  kScriptArabic,      // U+0600..U+06FF.
+  kScriptHangulJamo,  // U+1100..U+11FF.  Used primarily for Korean.
+  kScriptHiragana,    // U+3041..U+309F.  Used primarily for Japanese.
+  kScriptKatakana,    // U+30A0..U+30FF.  Used primarily for Japanese.
+  // Add new scripts here.
+  // Do not add any script after kNumRelevantScripts.  This value indicates the
+  // number of elements in this enum Script (except this value) such that we can
+  // easily iterate over the scripts.
+  kNumRelevantScripts,
+};
+// Returns true iff low <= value <= hi, i.e. both bounds are inclusive.
+template <typename IntType>
+inline bool InRange(IntType value, IntType low, IntType hi) {
+  if (value < low) {
+    return false;
+  }
+  return !(value > hi);
+}
+// Maps the UTF8 character that starts at address p to a Script value, based on
+// its byte length and (for 2- and 3-byte characters) its code point.
+// Precondition: p points to a valid UTF8 character of num_bytes bytes.
+// Returns kScriptError if num_bytes is not 1, 2, 3 or 4.
+inline Script GetScript(const unsigned char *p, int num_bytes) {
+  if (num_bytes == 1) {
+    return kScriptOtherUtf8OneByte;
+  }
+  if (num_bytes == 2) {
+    // A 2-byte UTF8 character carries 11 bits of information.  unsigned int
+    // has at least 16 bits (http://en.cppreference.com/w/cpp/language/types),
+    // so it is wide enough, and it is usually the fastest int type on the
+    // current CPU.
+    constexpr unsigned int kGreekFirst = 0x370;
+    constexpr unsigned int kGreekLast = 0x3FF;
+    constexpr unsigned int kCyrillicFirst = 0x400;
+    constexpr unsigned int kCyrillicLast = 0x4FF;
+    constexpr unsigned int kHebrewFirst = 0x590;
+    constexpr unsigned int kHebrewLast = 0x5FF;
+    constexpr unsigned int kArabicFirst = 0x600;
+    constexpr unsigned int kArabicLast = 0x6FF;
+    // 5 payload bits from the lead byte, 6 from the continuation byte.
+    const unsigned int code = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
+    if (InRange(code, kGreekFirst, kGreekLast)) {
+      return kScriptGreek;
+    }
+    if (InRange(code, kCyrillicFirst, kCyrillicLast)) {
+      return kScriptCyrillic;
+    }
+    if (InRange(code, kHebrewFirst, kHebrewLast)) {
+      return kScriptHebrew;
+    }
+    if (InRange(code, kArabicFirst, kArabicLast)) {
+      return kScriptArabic;
+    }
+    return kScriptOtherUtf8TwoBytes;
+  }
+  if (num_bytes == 3) {
+    // A 3-byte UTF8 character carries 16 bits of information, which still fits
+    // in an unsigned int.
+    constexpr unsigned int kHangulJamoFirst = 0x1100;
+    constexpr unsigned int kHangulJamoLast = 0x11FF;
+    constexpr unsigned int kHiraganaFirst = 0x3041;
+    constexpr unsigned int kHiraganaLast = 0x309F;
+    constexpr unsigned int kKatakanaFirst = 0x30A0;
+    constexpr unsigned int kKatakanaLast = 0x30FF;
+    // 4 payload bits from the lead byte, 6 from each continuation byte.
+    const unsigned int code =
+        ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
+    if (InRange(code, kHangulJamoFirst, kHangulJamoLast)) {
+      return kScriptHangulJamo;
+    }
+    if (InRange(code, kHiraganaFirst, kHiraganaLast)) {
+      return kScriptHiragana;
+    }
+    if (InRange(code, kKatakanaFirst, kKatakanaLast)) {
+      return kScriptKatakana;
+    }
+    return kScriptOtherUtf8ThreeBytes;
+  }
+  if (num_bytes == 4) {
+    return kScriptOtherUtf8FourBytes;
+  }
+  // Any other length is not a valid UTF8 character length.
+  return kScriptError;
+}
+// Overload of GetScript for "char" pointers, which is what most code works
+// with.  Whether plain char is signed is implementation-defined, so the bytes
+// are viewed as unsigned char before being handed to the "unsigned char"
+// overload; this way they are always treated as unsigned values.
+inline Script GetScript(const char *p, int num_bytes) {
+  const unsigned char *const bytes =
+      reinterpret_cast<const unsigned char *>(p);
+  return GetScript(bytes, num_bytes);
+}
+}  // namespace chrome_lang_id
+#endif  // SCRIPT_DETECTOR_H_

data/ext/cld3/sentence.proto ADDED

@@ -0,0 +1,77 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Protocol buffer specification for sentence analysis.
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package chrome_lang_id;
+// A Sentence contains the raw text contents of a sentence, as well as an
+// analysis (its tokenization).
+message Sentence {
+  // Identifier for sentence.
+  optional string id = 1;
+  // Raw text contents of the sentence.
+  optional string text = 2;
+  // Tokenization of the sentence.
+  repeated Token token = 3;
+  extensions 1000 to max;  // Field numbers >= 1000 are open to extensions.
+}
+// A sentence token marks a span of bytes in the sentence text as a token
+// or word.
+message Token {
+  // Token word form.
+  required string word = 1;
+  // Start position of token in text.
+  required int32 start = 2;
+  // End position of token in text. Gives index of last byte, not one past
+  // the last byte. If token came from lexer, excludes any trailing HTML tags.
+  required int32 end = 3;
+  // Head of this token in the dependency tree: the id of the token which has an
+  // arc going to this one. If it is the root token of a sentence, then it is
+  // set to -1.
+  optional int32 head = 4 [default = -1];
+  // Part-of-speech tag for token.
+  optional string tag = 5;
+  // Coarse-grained word category for token.
+  optional string category = 6;
+  // Label for dependency relation between this token and its head.
+  optional string label = 7;
+  // Break level for tokens that indicates how it was separated from the
+  // previous token in the text.
+  enum BreakLevel {
+    NO_BREAK = 0;         // No separation between tokens.
+    SPACE_BREAK = 1;      // Tokens separated by space.
+    LINE_BREAK = 2;       // Tokens separated by line break.
+    SENTENCE_BREAK = 3;   // Tokens separated by sentence break.
+  }
+  optional BreakLevel break_level = 8 [default = SPACE_BREAK];
+  extensions 1000 to max;  // Field numbers >= 1000 are open to extensions.
+}

data/ext/cld3/sentence_features.cc ADDED

@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "sentence_features.h"
+#include "registry.h"
+namespace chrome_lang_id {
+// Defines the registry for the whole-Sentence feature functions.  NOTE: it is
+// not yet set to anything meaningful (it starts out as nullptr).  It will be
+// set in the NNetLanguageIdentifier constructor, *before* we use any feature.
+template <>
+WholeSentenceFeature::Registry
+    *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
+}  // namespace chrome_lang_id

data/ext/cld3/sentence_features.h ADDED

@@ -0,0 +1,35 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Features that operate on Sentence objects. Most features are defined
+// in this header so they may be re-used via composition into other more
+// advanced feature classes.
+#ifndef SENTENCE_FEATURES_H_
+#define SENTENCE_FEATURES_H_
+#include "feature_extractor.h"
+#include "cld_3/protos/sentence.pb.h"
+namespace chrome_lang_id {
+// Feature function and feature extractor types instantiated for Sentence,
+// i.e. they extract features for the full Sentence.
+using WholeSentenceFeature = FeatureFunction<Sentence>;
+using WholeSentenceExtractor = FeatureExtractor<Sentence>;
+}  // namespace chrome_lang_id
+#endif  // SENTENCE_FEATURES_H_

data/ext/cld3/simple_adder.h ADDED

@@ -0,0 +1,72 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef SIMPLE_ADDER_H_
+#define SIMPLE_ADDER_H_
+#include "base.h"
+namespace chrome_lang_id {
+// Adds (optionally scaled) float arrays into a destination array, one float
+// at a time (kNumFloatsPerBatch is 1).
+class SimpleAdder {
+ public:
+  static constexpr int kNumFloatsPerBatch = 1;
+  // Records the destination array and its length.  The class never frees
+  // |dest|.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE SimpleAdder(float *dest, int num_floats)
+      : dest_{dest}, num_floats_{num_floats} {}
+  // Debug-checks that Finalize() was called before the object goes away.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE ~SimpleAdder() {
+    CLD3_DCHECK(dest_ == nullptr);
+  }
+  // Callers must invoke this before the object is destroyed; it drops the
+  // destination pointer.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE void Finalize() { dest_ = nullptr; }
+  // Adds the first num_floats elements of |source| to the destination.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyAdd(const float *source) const {
+    AddImpl(source, static_cast<uint32>(num_floats_), dest_);
+  }
+  // Adds scale * source[i] to the destination for the first num_floats
+  // elements of |source|.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyScaleAdd(const float *source,
+                                                 const float scale) const {
+    ScaleAddImpl(source, static_cast<uint32>(num_floats_), scale, dest_);
+  }
+  // Element-wise dest += source over |size| floats.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE static void AddImpl(
+      const float *__restrict source, uint32 size, float *__restrict dest) {
+    uint32 index = 0;
+    while (index < size) {
+      dest[index] += source[index];
+      ++index;
+    }
+  }
+  // Element-wise dest += source * scale over |size| floats.
+  CLD3_ATTRIBUTE_ALWAYS_INLINE static void ScaleAddImpl(
+      const float *__restrict source, uint32 size, const float scale,
+      float *__restrict dest) {
+    uint32 index = 0;
+    while (index < size) {
+      dest[index] += source[index] * scale;
+      ++index;
+    }
+  }
+ private:
+  float *dest_;     // Destination array; nullptr once Finalize() has run.
+  int num_floats_;  // Number of floats processed per add.
+};
+}  // namespace chrome_lang_id
+#endif  // SIMPLE_ADDER_H_