RubyGems - cld3 - Versions diffs - 3.1.0 - Mend

cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

checksums.yaml +7 -0
data/Gemfile +18 -0
data/LICENSE +204 -0
data/LICENSE_CLD3 +203 -0
data/README.md +22 -0
data/cld3.gemspec +35 -0
data/ext/cld3/base.cc +36 -0
data/ext/cld3/base.h +106 -0
data/ext/cld3/casts.h +98 -0
data/ext/cld3/embedding_feature_extractor.cc +51 -0
data/ext/cld3/embedding_feature_extractor.h +182 -0
data/ext/cld3/embedding_network.cc +196 -0
data/ext/cld3/embedding_network.h +186 -0
data/ext/cld3/embedding_network_params.h +285 -0
data/ext/cld3/extconf.rb +49 -0
data/ext/cld3/feature_extractor.cc +137 -0
data/ext/cld3/feature_extractor.h +633 -0
data/ext/cld3/feature_extractor.proto +50 -0
data/ext/cld3/feature_types.cc +72 -0
data/ext/cld3/feature_types.h +158 -0
data/ext/cld3/fixunicodevalue.cc +55 -0
data/ext/cld3/fixunicodevalue.h +69 -0
data/ext/cld3/float16.h +58 -0
data/ext/cld3/fml_parser.cc +308 -0
data/ext/cld3/fml_parser.h +123 -0
data/ext/cld3/generated_entities.cc +296 -0
data/ext/cld3/generated_ulscript.cc +678 -0
data/ext/cld3/generated_ulscript.h +142 -0
data/ext/cld3/getonescriptspan.cc +1109 -0
data/ext/cld3/getonescriptspan.h +124 -0
data/ext/cld3/integral_types.h +37 -0
data/ext/cld3/lang_id_nn_params.cc +57449 -0
data/ext/cld3/lang_id_nn_params.h +178 -0
data/ext/cld3/language_identifier_features.cc +165 -0
data/ext/cld3/language_identifier_features.h +116 -0
data/ext/cld3/nnet_language_identifier.cc +380 -0
data/ext/cld3/nnet_language_identifier.h +175 -0
data/ext/cld3/nnet_language_identifier_c.cc +72 -0
data/ext/cld3/offsetmap.cc +478 -0
data/ext/cld3/offsetmap.h +168 -0
data/ext/cld3/port.h +143 -0
data/ext/cld3/registry.cc +28 -0
data/ext/cld3/registry.h +242 -0
data/ext/cld3/relevant_script_feature.cc +89 -0
data/ext/cld3/relevant_script_feature.h +49 -0
data/ext/cld3/script_detector.h +156 -0
data/ext/cld3/sentence.proto +77 -0
data/ext/cld3/sentence_features.cc +29 -0
data/ext/cld3/sentence_features.h +35 -0
data/ext/cld3/simple_adder.h +72 -0
data/ext/cld3/stringpiece.h +81 -0
data/ext/cld3/task_context.cc +161 -0
data/ext/cld3/task_context.h +81 -0
data/ext/cld3/task_context_params.cc +74 -0
data/ext/cld3/task_context_params.h +54 -0
data/ext/cld3/task_spec.proto +98 -0
data/ext/cld3/text_processing.cc +245 -0
data/ext/cld3/text_processing.h +30 -0
data/ext/cld3/unicodetext.cc +96 -0
data/ext/cld3/unicodetext.h +144 -0
data/ext/cld3/utf8acceptinterchange.h +486 -0
data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
data/ext/cld3/utf8repl_lettermarklower.h +758 -0
data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
data/ext/cld3/utf8statetable.cc +1344 -0
data/ext/cld3/utf8statetable.h +285 -0
data/ext/cld3/utils.cc +241 -0
data/ext/cld3/utils.h +144 -0
data/ext/cld3/workspace.cc +64 -0
data/ext/cld3/workspace.h +177 -0
data/lib/cld3.rb +99 -0
metadata +158 -0

data/ext/cld3/task_spec.proto ADDED

@@ -0,0 +1,98 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// LINT: ALLOW_GROUPS
+// Protocol buffer specifications for task configuration.
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package chrome_lang_id;
+// Task input descriptor: names one input resource of a task and describes the
+// file set(s) it is made of.
+message TaskInput {
+  // Name of input resource.
+  required string name = 1;
+  // Name of the stage responsible for creating this resource.
+  optional string creator = 2;
+  // File format(s) for the resource.
+  repeated string file_format = 3;
+  // Record format(s) for the resource.
+  repeated string record_format = 4;
+  // Is this resource multi-file? Defaults to false.
+  optional bool multi_file = 5 [default = false];
+  // An input can consist of multiple file sets; each Part describes one.
+  repeated group Part = 6 {
+    // File pattern for file set.
+    optional string file_pattern = 7;
+    // File format for file set.
+    optional string file_format = 8;
+    // Record format for file set.
+    optional string record_format = 9;
+  }
+}
+// Task output descriptor: names one output resource of a task and describes
+// how its files are formatted, sharded and named.
+message TaskOutput {
+  // Name of output resource.
+  required string name = 1;
+  // File format for output resource.
+  optional string file_format = 2;
+  // Record format for output resource.
+  optional string record_format = 3;
+  // Number of shards in the output. If it is different from zero, this output
+  // is sharded. If the number of shards is set to -1, the output is sharded
+  // but the number of shards is unknown; the files are then named
+  // 'base-*-of-*'. Defaults to 0 (not sharded).
+  optional int32 shards = 4 [default = 0];
+  // Base file name for output resource. If this is not set by the task
+  // component it is set to a default value by the workflow engine.
+  optional string file_base = 5;
+  // Optional extension added to the file name.
+  optional string file_extension = 6;
+}
+// A task specification describes the execution parameters of a task: its
+// name and type, its name/value parameters, and its inputs and outputs.
+message TaskSpec {
+  // Name of task.
+  optional string task_name = 1;
+  // Workflow task type.
+  optional string task_type = 2;
+  // Task parameters, as name/value pairs. The name is required; the value is
+  // optional.
+  repeated group Parameter = 3 {
+    required string name = 4;
+    optional string value = 5;
+  }
+  // Task inputs.
+  repeated TaskInput input = 6;
+  // Task outputs.
+  repeated TaskOutput output = 7;
+}

data/ext/cld3/text_processing.cc ADDED

@@ -0,0 +1,245 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "text_processing.h"
+#include <stdio.h>
+#include <string.h>
+namespace chrome_lang_id {
+namespace CLD2 {
+namespace {
+// Upper bound, in bytes, on how far the word-boundary scans in this file look
+// for a space.
+static const int kMaxSpaceScan = 32;
+// Returns the smaller of a and b (b when the two are equal).
+int minint(int a, int b) {
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+// Returns how many ASCII spaces occur in the first (src_len rounded down to a
+// multiple of four) bytes of src. Bytes are examined in groups of four, which
+// is a little faster than one at a time; the 0..3 leftover bytes at the end
+// are deliberately not counted.
+int CountSpaces4(const char *src, int src_len) {
+  const int counted_len = src_len & ~3;  // Drop the odd bytes at the end
+  int spaces = 0;
+  int pos = 0;
+  while (pos < counted_len) {
+    const char *quad = src + pos;
+    if (quad[0] == ' ') ++spaces;
+    if (quad[1] == ' ') ++spaces;
+    if (quad[2] == ' ') ++spaces;
+    if (quad[3] == ' ') ++spaces;
+    pos += 4;
+  }
+  return spaces;
+}
+// Cheap repetitiveness measure: runs a one-entry-per-hash predictor over the
+// text and reports how much of it was predicted correctly. The predictor works
+// on whole UTF-8 characters rather than bytes, because three-byte UTF-8 text
+// (Indic, etc.) always looks highly compressible to a byte-based count.
+//
+// isrc/src_len: the text to scan.
+// hash: in/out 12-bit running hash, so prediction can continue across chunks.
+// tbl:  in/out int[4096] prediction table, indexed by the hash.
+// The caller initializes *hash and tbl to 0 before the first chunk.
+//
+// Returns the number of *bytes* correctly predicted; each correctly predicted
+// character contributes its length, 1..4.
+//
+// NOTE: Reads up to three bytes past src_len when the last character starts
+// near the end. Not a problem with valid UTF-8 text.
+//
+// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
+int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) {
+  typedef unsigned char uint8;
+  const uint8 *cursor = reinterpret_cast<const uint8 *>(isrc);
+  const uint8 *const end = cursor + src_len;
+  int slot = *hash;
+  int predicted_bytes = 0;
+  while (cursor < end) {
+    // Pack the bytes of one character into ch and note its length. Bytes
+    // below 0xc0 (ASCII or a stray continuation byte) stand alone.
+    const int lead = cursor[0];
+    int ch = lead;
+    int ch_len = 1;
+    if (lead >= 0xc0) {
+      if ((lead & 0xe0) == 0xc0) {
+        // Two-byte
+        ch = (lead << 8) | cursor[1];
+        ch_len = 2;
+      } else if ((lead & 0xf0) == 0xe0) {
+        // Three-byte
+        ch = (lead << 16) | (cursor[1] << 8) | cursor[2];
+        ch_len = 3;
+      } else {
+        // Four-byte (every remaining lead byte value lands here)
+        ch = (lead << 24) | (cursor[1] << 16) | (cursor[2] << 8) | cursor[3];
+        ch_len = 4;
+      }
+    }
+    cursor += ch_len;
+    // Compare against what this hash slot predicted, then retrain the slot.
+    const int previous = tbl[slot];
+    tbl[slot] = ch;
+    if (previous == ch) {
+      predicted_bytes += ch_len;  // Count bytes of good predictions
+    }
+    // Fold the character into the hash, keeping only 12 bits.
+    slot = ((slot << 4) ^ ch) & 0xfff;
+  }
+  *hash = slot;
+  return predicted_bytes;
+}
+// Backscan to a word boundary. Returns the smallest n such that src[-n - 1]
+// is a space, i.e. src - n is the first byte after that space.
+// At most min(limit, kMaxSpaceScan) positions are examined. If no space is
+// found within that range, returns instead the smallest n for which src[-n]
+// is not a UTF-8 continuation byte (a clean character boundary), or 0 if
+// there is none in range either.
+int BackscanToSpace(const char *src, int limit) {
+  const int max_scan = minint(limit, kMaxSpaceScan);
+  // First choice: back up to just after the nearest preceding space.
+  for (int back = 0; back < max_scan; ++back) {
+    if (src[-back - 1] == ' ') {
+      return back;  // We are at _X
+    }
+  }
+  // Fallback: back up to the start of a UTF-8 character.
+  for (int back = 0; back < max_scan; ++back) {
+    if ((src[-back] & 0xc0) != 0x80) {
+      return back;  // We are at char begin
+    }
+  }
+  return 0;
+}
+// Forwardscan to a word boundary. Returns n + 1 for the smallest n such that
+// src[n] is a space, i.e. the number of bytes to skip to land just past that
+// space.
+// At most min(limit, kMaxSpaceScan) positions are examined. If no space is
+// found within that range, returns instead the smallest n for which src[n]
+// is not a UTF-8 continuation byte (a clean character boundary), or 0 if
+// there is none in range either.
+int ForwardscanToSpace(const char *src, int limit) {
+  const int max_scan = minint(limit, kMaxSpaceScan);
+  // First choice: advance to just past the nearest following space.
+  for (int ahead = 0; ahead < max_scan; ++ahead) {
+    if (src[ahead] == ' ') {
+      return ahead + 1;  // We are at _X
+    }
+  }
+  // Fallback: advance to the start of a UTF-8 character.
+  for (int ahead = 0; ahead < max_scan; ++ahead) {
+    if ((src[ahead] & 0xc0) != 0x80) {
+      return ahead;  // We are at char begin
+    }
+  }
+  return 0;
+}
+}  // namespace
+// Must be exactly 4096 for cheap compressor: CountPredictedBytes masks its
+// hash with 0xfff, so the hash indexes exactly this many table entries.
+static const int kPredictionTableSize = 4096;
+static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
+static const int kSpacesThreshPercent = 30;   // Squeeze if >=30% spaces
+static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
+// Remove portions of text that have a high density of spaces, or that are
+// overly repetitive, squeezing the remaining text in-place to the front of the
+// input buffer.
+//
+// Squeezing looks at the density of space/predicted chars in fixed-size
+// chunks, specified by ichunksize. An ichunksize <= 0 uses the default size of
+// 48 bytes.
+//
+// isrc:       buffer to squeeze in place. It must be followed by a space (see
+//             the boundary loop below, which reads the byte at the end of
+//             each chunk, including the one at isrc[src_len]).
+// src_len:    number of text bytes in isrc.
+// ichunksize: chunk size in bytes, or <= 0 for the default.
+//
+// Returns the new, possibly-shorter length.
+//
+// Result Buffer ALWAYS has leading space and trailing space space space NUL,
+// if input does
+//
+int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) {
+  char *src = isrc;
+  char *dst = src;
+  char *srclimit = src + src_len;
+  bool skipping = false;
+  int hash = 0;
+  // Allocate local prediction table, zeroed as CountPredictedBytes expects.
+  int *predict_tbl = new int[kPredictionTableSize];
+  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
+  // Any non-positive chunk size selects the default. A negative size must not
+  // be used as-is: it would make each chunk length negative and move src
+  // backwards, out of the buffer.
+  int chunksize = ichunksize;
+  if (chunksize <= 0) {
+    chunksize = kChunksizeDefault;
+  }
+  int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
+  int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
+  while (src < srclimit) {
+    int remaining_bytes = static_cast<int>(srclimit - src);
+    int len = minint(chunksize, remaining_bytes);
+    // Make len land us on a UTF-8 character boundary.
+    // Ah. Also fixes mispredict because we could get out of phase
+    // Loop always terminates at trailing space in buffer
+    while ((src[len] & 0xc0) == 0x80) {
+      ++len;
+    }  // Move past continuation bytes
+    int space_n = CountSpaces4(src, len);
+    int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
+    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
+      // Skip the text
+      if (!skipping) {
+        // Keeping-to-skipping transition; do it at a space
+        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
+        dst -= n;
+        if (dst == isrc) {
+          // Force a leading space if the first chunk is deleted
+          *dst++ = ' ';
+        }
+        skipping = true;
+      }
+    } else {
+      // Keep the text
+      if (skipping) {
+        // Skipping-to-keeping transition; do it at a space
+        int n = ForwardscanToSpace(src, len);
+        src += n;
+        len -= n;  // Only the part after the boundary is kept
+        skipping = false;
+      }
+      // "len" can be negative in some cases
+      if (len > 0) {
+        memmove(dst, src, len);
+        dst += len;
+      }
+    }
+    src += len;
+  }
+  if ((dst - isrc) < (src_len - 3)) {
+    // Pad and make last char clean UTF-8 by putting following spaces
+    dst[0] = ' ';
+    dst[1] = ' ';
+    dst[2] = ' ';
+    dst[3] = '\0';
+  } else if ((dst - isrc) < src_len) {
+    // Make last char clean UTF-8 by putting following space off the end
+    dst[0] = ' ';
+  }
+  // Deallocate local prediction table
+  delete[] predict_tbl;
+  return static_cast<int>(dst - isrc);
+}
+}  // namespace CLD2
+}  // namespace chrome_lang_id

data/ext/cld3/text_processing.h ADDED

@@ -0,0 +1,30 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef SCRIPT_SPAN_TEXT_PROCESSING_H_
+#define SCRIPT_SPAN_TEXT_PROCESSING_H_
+namespace chrome_lang_id {
+namespace CLD2 {
+// Remove portions of text that have a high density of spaces, or that are
+// overly repetitive, squeezing the remaining text in-place to the front
+// of the input buffer.
+//
+// isrc:       buffer to squeeze; modified in place.
+// srclen:     number of text bytes in isrc.
+// ichunksize: size in bytes of the chunks whose space/repetition density is
+//             measured; 0 selects the default of 48 bytes.
+//
+// Return the new, possibly-shorter length
+int CheapSqueezeInplace(char *isrc, int srclen, int ichunksize);
+}  // namespace CLD2
+}  // namespace chrome_lang_id
+#endif  // SCRIPT_SPAN_TEXT_PROCESSING_H_

data/ext/cld3/unicodetext.cc ADDED

@@ -0,0 +1,96 @@
+// Copyright (C) 2006 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Author: Jim Meehan
+#include "unicodetext.h"
+#include "base.h"
+#include "utils.h"
+namespace chrome_lang_id {
+// *************** Data representation **********
+// Note: the copy constructor is undefined.
+// Makes this Repr refer to the caller's buffer of `size` bytes at `data`
+// without copying it. Size and capacity are both set to `size`, and the Repr
+// is marked as not owning the buffer.
+void UnicodeText::Repr::PointTo(const char *data, int size) {
+  // Release the previous buffer only if this Repr owned it.
+  const bool owns_old_buffer = ours_ && data_;
+  if (owns_old_buffer) {
+    delete[] data_;
+  }
+  ours_ = false;
+  capacity_ = size;
+  size_ = size;
+  data_ = const_cast<char *>(data);
+}
+// *************** UnicodeText ******************
+// Default constructor; does no work of its own.
+UnicodeText::UnicodeText() {}
+// Points this UnicodeText at the first byte_length bytes of buffer, via
+// Repr::PointTo. Returns *this so calls can be chained.
+UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
+  repr_.PointTo(buffer, byte_length);
+  return *this;
+}
+// Destructor; does no work of its own.
+UnicodeText::~UnicodeText() {}
+// ******************* UnicodeText::const_iterator *********************
+// The implementation of const_iterator would be nicer if it
+// inherited from boost::iterator_facade
+// (http://boost.org/libs/iterator/doc/iterator_facade.html).
+// Default constructor: the iterator's position starts out as 0 (null), so it
+// refers to no text.
+UnicodeText::const_iterator::const_iterator() : it_(0) {}
+// Copy assignment: makes this iterator refer to the same position as other.
+// Self-assignment leaves the iterator untouched. Returns *this.
+UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
+    const const_iterator &other) {
+  if (this == &other) {
+    return *this;
+  }
+  it_ = other.it_;
+  return *this;
+}
+// Returns an iterator positioned at the first byte of the text (repr_.data_).
+UnicodeText::const_iterator UnicodeText::begin() const {
+  return const_iterator(repr_.data_);
+}
+// Returns an iterator positioned one past the last byte of the text, i.e. at
+// repr_.data_ + repr_.size_.
+UnicodeText::const_iterator UnicodeText::end() const {
+  return const_iterator(repr_.data_ + repr_.size_);
+}
+// Decodes and returns the code point of the UTF-8 character at the iterator's
+// current position.
+//
+// (chartorune could be called here instead, but it does error-checking that
+// is unnecessary because the data is guaranteed to be valid UTF-8, and this
+// routine is expected to be called very often. So the decoding is done by
+// hand for speed.)
+//
+// The lead byte alone decides how many bytes are read: 1 below 0x80, 2 below
+// 0xE0, 3 below 0xF0, otherwise 4. No validation is performed.
+char32 UnicodeText::const_iterator::operator*() const {
+  const unsigned char lead = static_cast<unsigned char>(it_[0]);
+  if (lead < 0x80) return lead;  // Single byte: the value is the code point
+  // Number of bytes after the lead, and the payload bits the lead carries.
+  int trailing;
+  int code_point;
+  if (lead < 0xE0) {
+    trailing = 1;
+    code_point = lead & 0x1F;
+  } else if (lead < 0xF0) {
+    trailing = 2;
+    code_point = lead & 0x0F;
+  } else {
+    trailing = 3;
+    code_point = lead & 0x07;
+  }
+  // Each following byte contributes its low six bits.
+  for (int i = 1; i <= trailing; ++i) {
+    const unsigned char next = static_cast<unsigned char>(it_[i]);
+    code_point = (code_point << 6) | (next & 0x3F);
+  }
+  return code_point;
+}
+// Pre-increment: advances the iterator by the amount utils::OneCharLen reports
+// for the current position (presumably the byte length of the UTF-8 character
+// starting there -- confirm against utils.h). Returns *this.
+UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
+  it_ += chrome_lang_id::utils::OneCharLen(it_);
+  return *this;
+}
+}  // namespace chrome_lang_id