RubyGems - cld3 - Versions diffs - 3.1.0 - Mend

cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

checksums.yaml +7 -0
data/Gemfile +18 -0
data/LICENSE +204 -0
data/LICENSE_CLD3 +203 -0
data/README.md +22 -0
data/cld3.gemspec +35 -0
data/ext/cld3/base.cc +36 -0
data/ext/cld3/base.h +106 -0
data/ext/cld3/casts.h +98 -0
data/ext/cld3/embedding_feature_extractor.cc +51 -0
data/ext/cld3/embedding_feature_extractor.h +182 -0
data/ext/cld3/embedding_network.cc +196 -0
data/ext/cld3/embedding_network.h +186 -0
data/ext/cld3/embedding_network_params.h +285 -0
data/ext/cld3/extconf.rb +49 -0
data/ext/cld3/feature_extractor.cc +137 -0
data/ext/cld3/feature_extractor.h +633 -0
data/ext/cld3/feature_extractor.proto +50 -0
data/ext/cld3/feature_types.cc +72 -0
data/ext/cld3/feature_types.h +158 -0
data/ext/cld3/fixunicodevalue.cc +55 -0
data/ext/cld3/fixunicodevalue.h +69 -0
data/ext/cld3/float16.h +58 -0
data/ext/cld3/fml_parser.cc +308 -0
data/ext/cld3/fml_parser.h +123 -0
data/ext/cld3/generated_entities.cc +296 -0
data/ext/cld3/generated_ulscript.cc +678 -0
data/ext/cld3/generated_ulscript.h +142 -0
data/ext/cld3/getonescriptspan.cc +1109 -0
data/ext/cld3/getonescriptspan.h +124 -0
data/ext/cld3/integral_types.h +37 -0
data/ext/cld3/lang_id_nn_params.cc +57449 -0
data/ext/cld3/lang_id_nn_params.h +178 -0
data/ext/cld3/language_identifier_features.cc +165 -0
data/ext/cld3/language_identifier_features.h +116 -0
data/ext/cld3/nnet_language_identifier.cc +380 -0
data/ext/cld3/nnet_language_identifier.h +175 -0
data/ext/cld3/nnet_language_identifier_c.cc +72 -0
data/ext/cld3/offsetmap.cc +478 -0
data/ext/cld3/offsetmap.h +168 -0
data/ext/cld3/port.h +143 -0
data/ext/cld3/registry.cc +28 -0
data/ext/cld3/registry.h +242 -0
data/ext/cld3/relevant_script_feature.cc +89 -0
data/ext/cld3/relevant_script_feature.h +49 -0
data/ext/cld3/script_detector.h +156 -0
data/ext/cld3/sentence.proto +77 -0
data/ext/cld3/sentence_features.cc +29 -0
data/ext/cld3/sentence_features.h +35 -0
data/ext/cld3/simple_adder.h +72 -0
data/ext/cld3/stringpiece.h +81 -0
data/ext/cld3/task_context.cc +161 -0
data/ext/cld3/task_context.h +81 -0
data/ext/cld3/task_context_params.cc +74 -0
data/ext/cld3/task_context_params.h +54 -0
data/ext/cld3/task_spec.proto +98 -0
data/ext/cld3/text_processing.cc +245 -0
data/ext/cld3/text_processing.h +30 -0
data/ext/cld3/unicodetext.cc +96 -0
data/ext/cld3/unicodetext.h +144 -0
data/ext/cld3/utf8acceptinterchange.h +486 -0
data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
data/ext/cld3/utf8repl_lettermarklower.h +758 -0
data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
data/ext/cld3/utf8statetable.cc +1344 -0
data/ext/cld3/utf8statetable.h +285 -0
data/ext/cld3/utils.cc +241 -0
data/ext/cld3/utils.h +144 -0
data/ext/cld3/workspace.cc +64 -0
data/ext/cld3/workspace.h +177 -0
data/lib/cld3.rb +99 -0
metadata +158 -0

data/ext/cld3/utils.h ADDED

@@ -0,0 +1,144 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef UTILS_H_
+#define UTILS_H_
+#include <stddef.h>
+#include <functional>
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "base.h"
+#include "script_span/stringpiece.h"
+namespace chrome_lang_id {
+namespace utils {
+// String-to-number parsers. Only the declarations live in this header. Each
+// takes a C string plus an output pointer and returns a bool.
+// NOTE(review): presumably the bool is true when c_str parsed cleanly and
+// *value was written -- confirm against the definitions in utils.cc.
+bool ParseInt32(const char *c_str, int *value);
+bool ParseDouble(const char *c_str, double *value);
+// Runs the parser func over str and returns whatever it stored in the output
+// slot. func receives str's C string and a pointer to that slot.
+// func's bool result is not inspected here, so the slot is value-initialized
+// first: if func returns without writing to it, the caller receives T() (zero
+// for arithmetic types) instead of an indeterminate value.
+template <typename T>
+T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
+  T value = T();
+  func(str.c_str(), &value);
+  return value;
+}
+// Overload with a fallback value: an empty str yields defval without func ever
+// being invoked; any non-empty str is handed to the two-argument ParseUsing.
+template <typename T>
+T ParseUsing(const string &str, T defval,
+             std::function<bool(const char *, T *)> func) {
+  if (str.empty()) return defval;
+  return ParseUsing<T>(str, func);
+}
+// The functions below are only declared in this header. Their definitions are
+// not visible here, so every note is inferred from the name and signature and
+// should be confirmed against the implementation.
+// Presumably returns a copy of src with special characters C-escaped.
+string CEscape(const string &src);
+// Presumably splits text into pieces at each occurrence of delim.
+std::vector<string> Split(const string &text, char delim);
+// The three functions below take the StringPiece by pointer, so they are able
+// to adjust it in place. NOTE(review): the int result is presumably the number
+// of bytes removed -- TODO confirm.
+int RemoveLeadingWhitespace(StringPiece *text);
+int RemoveTrailingWhitespace(StringPiece *text);
+int RemoveWhitespaceContext(StringPiece *text);
+// Presumably hashes the n bytes starting at data, mixing in seed.
+uint32 Hash32(const char *data, size_t n, uint32 seed);
+// Presumably Hash32 over input's bytes with a built-in seed.
+uint32 Hash32WithDefaultSeed(const string &input);
+// Calls delete on every element of an STL container of raw pointers and then
+// empties the container. Works with a vector, set, hash_set, or any other
+// container type that provides sensible begin(), end(), and clear() methods.
+// Passing a NULL container is allowed and does nothing.
+template <typename T>
+void STLDeleteElements(T *container) {
+  if (container == nullptr) return;
+  for (auto it = container->begin(); it != container->end();) {
+    // Step the iterator forward before freeing, so it is never advanced from
+    // an element whose pointee has already been deleted.
+    auto doomed = it++;
+    delete *doomed;
+  }
+  container->clear();
+}
+// Static helpers for recognising punctuation characters and punctuation tags.
+class PunctuationUtil {
+ public:
+  // Inclusive [first, last] range of Unicode code points that count as
+  // punctuation according to CoNLL.
+  struct CharacterRange {
+    int first;
+    int last;
+  };
+  // Table of punctuation ranges; only declared here. IsPunctuation() stops
+  // scanning at the first entry whose `first` is not positive, and gives up
+  // early once an entry starts above the queried character.
+  static CharacterRange kPunctuation[];
+  // Returns true if the Unicode character u falls inside one of the
+  // kPunctuation ranges.
+  static bool IsPunctuation(int u) {
+    for (const CharacterRange *range = kPunctuation; range->first > 0;
+         ++range) {
+      // Every range examined so far ended below u; this one starts above it.
+      if (u < range->first) break;
+      if (u <= range->last) return true;
+    }
+    return false;
+  }
+  // Returns true if every character of tag is one of , : . ' `
+  // An empty tag has no offending character and therefore also returns true.
+  static bool IsPunctuationTag(const string &tag) {
+    for (const char symbol : tag) {
+      const bool allowed = symbol == ',' || symbol == ':' || symbol == '.' ||
+                           symbol == '\'' || symbol == '`';
+      if (!allowed) return false;
+    }
+    return true;
+  }
+  // Returns true if tag is non-empty and consists solely of the punctuation
+  // symbols accepted by IsPunctuationTag() and/or parentheses.
+  static bool IsPunctuationTagOrParens(const string &tag) {
+    if (tag.empty()) return false;
+    for (const char symbol : tag) {
+      const bool allowed = symbol == '(' || symbol == ')' || symbol == ',' ||
+                           symbol == ':' || symbol == '.' || symbol == '\'' ||
+                           symbol == '`';
+      if (!allowed) return false;
+    }
+    return true;
+  }
+};
+// Takes the string by pointer and returns nothing, so any result is written
+// back into *form. NOTE(review): presumably rewrites digit characters to a
+// canonical form -- the definition is not in this header; confirm in utils.cc.
+void NormalizeDigits(string *form);
+// Takes a text and convert it into a vector, where each element is a utf8
+// character.
+void GetUTF8Chars(const string &text, std::vector<string> *chars);
+// Returns the number of bytes in the first UTF-8 char at the beginning
+// of the string. It is assumed that the string is valid UTF-8.  If
+// the first byte of the string is null, return 0 (for backwards
+// compatibility only; this use is discouraged).
+int UTF8FirstLetterNumBytes(const char *in_buf);
+// Returns the length (number of bytes) of the Unicode code point starting at
+// src, based on inspecting just that one byte.  Preconditions: src != NULL,
+// *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
+// string.
+int OneCharLen(const char *src);
+}  // namespace utils
+}  // namespace chrome_lang_id
+#endif  // UTILS_H_

data/ext/cld3/workspace.cc ADDED

@@ -0,0 +1,64 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "workspace.h"
+#include "base.h"
+namespace chrome_lang_id {
+// Nothing to set up: the workspace map is default-constructed empty.
+WorkspaceSet::WorkspaceSet() {}
+// Resetting against a freshly constructed registry deletes every workspace
+// this set still holds (see WorkspaceSet::Reset in workspace.h).
+WorkspaceSet::~WorkspaceSet() { Reset(WorkspaceRegistry()); }
+// Both registry maps are default-constructed; there is nothing else to do on
+// construction or destruction.
+WorkspaceRegistry::WorkspaceRegistry() {}
+WorkspaceRegistry::~WorkspaceRegistry() {}
+// Builds a listing of the registered workspaces: one "\n  <type> :: <name>"
+// entry per workspace name, grouped by workspace type. Returns an empty string
+// when no names are registered.
+string WorkspaceRegistry::DebugString() const {
+  string description;
+  for (const auto &entry : workspace_names_) {
+    // at() throws std::out_of_range if a type has names but no type name.
+    const string &type_name = workspace_types_.at(entry.first);
+    for (const string &workspace_name : entry.second) {
+      description += "\n  ";
+      description += type_name;
+      description += " :: ";
+      description += workspace_name;
+    }
+  }
+  return description;
+}
+// Out-of-line members of VectorIntWorkspace.
+VectorIntWorkspace::~VectorIntWorkspace() {}
+// Vector of `size` value-initialized (zero) ints.
+VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}
+// Vector of `size` ints, each set to `value`.
+VectorIntWorkspace::VectorIntWorkspace(int size, int value)
+    : elements_(size, value) {}
+// Copies the given vector.
+VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
+    : elements_(elements) {}
+// Human-readable name for this workspace type.
+string VectorIntWorkspace::TypeName() { return "Vector"; }
+// Out-of-line members of VectorVectorIntWorkspace.
+VectorVectorIntWorkspace::~VectorVectorIntWorkspace() {}
+// Outer vector of `size` empty inner vectors.
+VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
+    : elements_(size) {}
+// Human-readable name for this workspace type.
+string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
+}  // namespace chrome_lang_id

data/ext/cld3/workspace.h ADDED

@@ -0,0 +1,177 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Notes on thread-safety: All of the classes here are thread-compatible.  More
+// specifically, the registry machinery is thread-safe, as long as each thread
+// performs feature extraction on a different Sentence object.
+#ifndef WORKSPACE_H_
+#define WORKSPACE_H_
+#include <stddef.h>
+#include <string>
+#include <typeindex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "base.h"
+namespace chrome_lang_id {
+// A base class for shared workspaces. Derived classes implement a static member
+// function TypeName() which returns a human readable string name for the class.
+class Workspace {
+ public:
+  // Virtual so that a derived workspace is destroyed correctly when deleted
+  // through a Workspace pointer (as WorkspaceSet::Reset does).
+  virtual ~Workspace() {}
+ protected:
+  // Create an empty workspace. Protected: only derived classes can be
+  // instantiated.
+  Workspace() {}
+ private:
+  // Workspaces are neither copyable nor assignable.
+  CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);
+};
+// A registry that keeps track of workspaces.
+// NOTE(review): as declared here the class has no member that inserts into
+// either map, so both stay empty unless code outside this view populates them.
+class WorkspaceRegistry {
+ public:
+  // Create an empty registry.
+  WorkspaceRegistry();
+  ~WorkspaceRegistry();
+  // Read-only view of the registered workspace names, keyed by workspace type.
+  // WorkspaceSet::Reset uses the per-type name counts to size its storage.
+  const std::unordered_map<std::type_index, std::vector<std::string>>
+      &WorkspaceNames() const {
+    return workspace_names_;
+  }
+  // Returns a string describing the registered workspaces.
+  string DebugString() const;
+ private:
+  // Workspace type names, indexed as workspace_types_[typeid].
+  std::unordered_map<std::type_index, string> workspace_types_;
+  // Workspace names, indexed as workspace_names_[typeid][workspace].
+  std::unordered_map<std::type_index, std::vector<string>> workspace_names_;
+  // The registry is neither copyable nor assignable.
+  CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
+};
+// A typed collection of workspaces. The workspaces are indexed according to an
+// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
+// also immutable.
+// The set owns the Workspace objects it points to: Reset() and the destructor
+// delete them. For that reason the set is not copyable -- a copy would hold the
+// same raw pointers and delete them a second time.
+class WorkspaceSet {
+ public:
+  WorkspaceSet();
+  ~WorkspaceSet();
+  // Deletes every workspace currently held, then re-creates the layout
+  // described by registry: one slot per registered workspace name of each
+  // type. The new slots are null pointers; no Workspace is allocated here.
+  void Reset(const WorkspaceRegistry &registry) {
+    // Deallocate current workspaces.
+    for (auto &entry : workspaces_) {
+      for (Workspace *workspace : entry.second) {
+        delete workspace;  // Deleting a null slot is a no-op.
+      }
+    }
+    workspaces_.clear();
+    // Allocate space for new workspaces.
+    for (auto &entry : registry.WorkspaceNames()) {
+      workspaces_[entry.first].resize(entry.second.size());
+    }
+  }
+ private:
+  // The set of workspaces, indexed as workspaces_[typeid][index].
+  std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
+  // Owning raw pointers: forbid copying, consistent with Workspace and
+  // WorkspaceRegistry.
+  CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceSet);
+};
+// A workspace that wraps around a single int.
+class SingletonIntWorkspace : public Workspace {
+ public:
+  // Leaves the int at its in-class default of 0.
+  SingletonIntWorkspace() {}
+  // Initializes the int with the given value.
+  explicit SingletonIntWorkspace(int value) : value_(value) {}
+  // Returns the name of this type of workspace.
+  static string TypeName() { return "SingletonInt"; }
+  // Returns the int value.
+  int get() const { return value_; }
+  // Sets the int value.
+  void set(int value) { value_ = value; }
+ private:
+  // The enclosed int; 0 unless a value was supplied or set.
+  int value_ = 0;
+};
+// A workspace that wraps around a vector of int.
+class VectorIntWorkspace : public Workspace {
+ public:
+  // Creates a vector of the given size.
+  explicit VectorIntWorkspace(int size);
+  // Creates a vector initialized with a copy of the given vector.
+  explicit VectorIntWorkspace(const std::vector<int> &elements);
+  // Creates a vector of the given size, with each element initialized to the
+  // given value.
+  VectorIntWorkspace(int size, int value);
+  ~VectorIntWorkspace() override;
+  // Returns the name of this type of workspace.
+  static string TypeName();
+  // Returns the i'th element. Uses unchecked indexing: i must be a valid
+  // index into the vector.
+  int element(int i) const { return elements_[i]; }
+  // Sets the i'th element. Uses unchecked indexing: i must be a valid index
+  // into the vector.
+  void set_element(int i, int value) { elements_[i] = value; }
+ private:
+  // The enclosed vector.
+  std::vector<int> elements_;
+};
+// A workspace that wraps around a vector of vector of int.
+class VectorVectorIntWorkspace : public Workspace {
+ public:
+  // Creates a vector of empty vectors of the given size.
+  explicit VectorVectorIntWorkspace(int size);
+  ~VectorVectorIntWorkspace() override;
+  // Returns the name of this type of workspace.
+  static string TypeName();
+  // Returns the i'th vector of elements. Uses unchecked indexing: i must be a
+  // valid index into the outer vector.
+  const std::vector<int> &elements(int i) const { return elements_[i]; }
+  // Mutable access to the i'th vector of elements. The returned pointer
+  // refers to storage inside this workspace. Uses unchecked indexing.
+  std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
+ private:
+  // The enclosed vector of vector of elements.
+  std::vector<std::vector<int>> elements_;
+};
+}  // namespace chrome_lang_id
+#endif  // WORKSPACE_H_

data/lib/cld3.rb ADDED

@@ -0,0 +1,99 @@
+# File including an implementation of the CLD3 module. Some documentation is
+# extracted from ext/cld3/ext/src/nnet_language_identifier.h.
+#
+# Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+require "ffi"
+# Module providing an interface for Compact Language Detector v3 (CLD3)
+module CLD3
+  # Class for detecting the language of a document.
+  class NNetLanguageIdentifier
+    # Min number of bytes needed to make a prediction if the construcotr is
+    # called without the corresponding parameter.
+    MIN_NUM_BYTES_TO_CONSIDER = 140
+    # Max number of bytes needed to make a prediction if the construcotr is
+    # called without the corresponding parameter.
+    MAX_NUM_BYTES_TO_CONSIDER = 700
+    # Max number of input bytes to process.
+    MAX_NUM_INPUT_BYTES_TO_CONSIDER = 10000
+    # Predictions with probability greater than or equal to this threshold are
+    # marked as reliable. This threshold was optimized on a set of text segments
+    # extracted from wikipedia, and results in an overall precision, recall,
+    # and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
+    RELIABILITY_THRESHOLD = 0.7
+    # Reliability threshold for the languages hr and bs.
+    RELIABILITY_HR_BS_THRESHOLD = 0.5
+    # Information about a predicted language.
+    Result = Struct.new("Result", :language, :probability, :reliable?, :proportion)
+    def initialize(minNumBytes = MIN_NUM_BYTES_TO_CONSIDER, maxNumBytes = MAX_NUM_BYTES_TO_CONSIDER)
+      @cc = Pointer.new(CLD3::Unstable.new_NNetLanguageIdentifier(minNumBytes, maxNumBytes))
+    end
+    # Finds the most likely language for the given text, along with additional
+    # information (e.g., probability). The prediction is based on the first N
+    # bytes where N is the minumum between the number of interchange valid UTF8
+    # bytes and max_num_bytes_. If N is less than min_num_bytes_ long, then this
+    # function returns nil.
+    def find_language(text)
+      text_utf8 = text.encode(Encoding::UTF_8)
+      pointer = FFI::MemoryPointer.new(:char, text_utf8.bytesize)
+      pointer.put_bytes(0, text_utf8)
+      cc_result = CLD3::Unstable.NNetLanguageIdentifier_find_language(@cc, pointer, text_utf8.bytesize)
+      language = cc_result[:language_data].read_bytes(cc_result[:language_size])
+      Result.new(
+          language == "und" ? nil : language,
+          cc_result[:probability],
+          cc_result[:reliable?],
+          cc_result[:proportion])
+    end
+    private
+    class Pointer < FFI::AutoPointer
+      def self.release(pointer)
+        CLD3::Unstable.delete_NNetLanguageIdentifier(pointer)
+      end
+    end
+  end
+  # Do NOT use this module from outside.
+  # Raw FFI bindings to the native cld3 library; NNetLanguageIdentifier is the
+  # supported interface on top of them.
+  module Unstable
+    extend FFI::Library
+    # Loads the shared library from ../ext/cld3 relative to this file, using
+    # the platform-specific file name FFI derives for "cld3".
+    ffi_lib File.join(File.expand_path(File.dirname(__FILE__)), "..", "ext", "cld3", FFI.map_library_name("cld3"))
+    # By-value result of NNetLanguageIdentifier_find_language. The language is
+    # returned as a pointer plus a byte length.
+    # NOTE(review): field order and types presumably mirror the C struct in
+    # nnet_language_identifier_c.cc -- keep the two in sync.
+    class NNetLanguageIdentifierResult < FFI::Struct
+      layout :language_data, :pointer, :language_size, :size_t, :probability, :float, :proportion, :float, :reliable?, :bool
+    end
+    # Destroys an identifier created by new_NNetLanguageIdentifier.
+    attach_function :delete_NNetLanguageIdentifier, [ :pointer ], :void
+    # Takes two ints (NNetLanguageIdentifier#initialize passes the min and max
+    # byte counts) and returns an opaque handle.
+    attach_function :new_NNetLanguageIdentifier, [ :int, :int ], :pointer
+    # Arguments: identifier handle, input byte buffer, buffer length in bytes.
+    attach_function :NNetLanguageIdentifier_find_language,
+        [ :pointer, :buffer_in, :size_t ], NNetLanguageIdentifierResult.by_value
+  end
+end