RubyGems - youtokentome - Versions diffs - 0.1.0 - Mend

youtokentome 0.1.0

Files changed (22) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +22 -0
data/README.md +104 -0
data/ext/youtokentome/ext.cpp +135 -0
data/ext/youtokentome/extconf.rb +12 -0
data/lib/youtokentome.rb +10 -0
data/lib/youtokentome/bpe.rb +54 -0
data/lib/youtokentome/ext.bundle +0 -0
data/lib/youtokentome/version.rb +3 -0
data/vendor/YouTokenToMe/LICENSE +19 -0
data/vendor/YouTokenToMe/README.md +304 -0
data/vendor/YouTokenToMe/youtokentome/cpp/bpe.cpp +2185 -0
data/vendor/YouTokenToMe/youtokentome/cpp/bpe.h +86 -0
data/vendor/YouTokenToMe/youtokentome/cpp/third_party/LICENSE +23 -0
data/vendor/YouTokenToMe/youtokentome/cpp/third_party/flat_hash_map.h +1502 -0
data/vendor/YouTokenToMe/youtokentome/cpp/utf8.cpp +134 -0
data/vendor/YouTokenToMe/youtokentome/cpp/utf8.h +23 -0
data/vendor/YouTokenToMe/youtokentome/cpp/utils.cpp +119 -0
data/vendor/YouTokenToMe/youtokentome/cpp/utils.h +105 -0
data/vendor/YouTokenToMe/youtokentome/cpp/yttm.pyx +182 -0
metadata +133 -0

data/vendor/YouTokenToMe/youtokentome/cpp/utf8.cpp ADDED Viewed

@@ -0,0 +1,134 @@
+#include "utf8.h"
+#include <cassert>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "utils.h"
+namespace vkcom {
+using std::string;
+using std::vector;
+// Returns true when `x` is a UTF-8 continuation byte, i.e. its two most
+// significant bits are `10`.
+bool check_byte(char x) {
+  const uint8_t byte = static_cast<uint8_t>(x);
+  return (byte >> 6u) == 0x2u;
+}
+// Returns true when `x` is below 0x110000 and does not fall in the
+// 0xd800..0xdfff range.
+bool check_codepoint(uint32_t x) {
+  if (x >= 0x110000) {
+    return false;
+  }
+  const bool in_excluded_range = (0xd800 <= x) && (x <= 0xdfff);
+  return !in_excluded_range;
+}
+// Returns the number of bytes (1..4) announced by the leading byte `ch` of a
+// UTF-8 sequence, or 0 when `ch` cannot start a sequence.
+uint64_t utf_length(char ch) {
+  const uint8_t lead = static_cast<uint8_t>(ch);
+  if (lead < 0x80u) {
+    return 1;  // 0xxxxxxx
+  }
+  if (lead < 0xc0u) {
+    return 0;  // 10xxxxxx: continuation byte, not a valid lead
+  }
+  if (lead < 0xe0u) {
+    return 2;  // 110xxxxx
+  }
+  if (lead < 0xf0u) {
+    return 3;  // 1110xxxx
+  }
+  if (lead < 0xf8u) {
+    return 4;  // 11110xxx
+  }
+  // Invalid utf-8
+  return 0;
+}
+// Decodes one UTF-8 sequence starting at `begin`.
+//
+// `size` is the number of readable bytes at `begin`; the caller must pass at
+// least 1 because begin[0] is read unconditionally. On success the decoded
+// code point is returned and `*utf8_len` receives the sequence length in
+// bytes. On any malformed, truncated, overlong or out-of-range sequence
+// INVALID_UNICODE is returned and `*utf8_len` is set to 1, so the caller
+// can skip a single byte and resynchronise.
+uint32_t chars_to_utf8(const char* begin, uint64_t size, uint64_t* utf8_len) {
+  const uint64_t length = utf_length(begin[0]);
+  if (length == 1) {
+    *utf8_len = 1;
+    return static_cast<uint8_t>(begin[0]);
+  }
+
+  uint32_t code_point = 0;
+  uint32_t smallest_allowed = 0;  // rejects overlong encodings
+  bool well_formed = false;
+
+  switch (length) {
+    case 2:
+      if (size >= 2 && check_byte(begin[1])) {
+        code_point = ((static_cast<uint8_t>(begin[0]) & 0x1fu) << 6u) |
+                     (static_cast<uint8_t>(begin[1]) & 0x3fu);
+        smallest_allowed = 0x0080;
+        well_formed = true;
+      }
+      break;
+    case 3:
+      if (size >= 3 && check_byte(begin[1]) && check_byte(begin[2])) {
+        code_point = ((static_cast<uint8_t>(begin[0]) & 0x0fu) << 12u) |
+                     ((static_cast<uint8_t>(begin[1]) & 0x3fu) << 6u) |
+                     (static_cast<uint8_t>(begin[2]) & 0x3fu);
+        smallest_allowed = 0x0800;
+        well_formed = true;
+      }
+      break;
+    case 4:
+      if (size >= 4 && check_byte(begin[1]) && check_byte(begin[2]) &&
+          check_byte(begin[3])) {
+        code_point = ((static_cast<uint8_t>(begin[0]) & 0x07u) << 18u) |
+                     ((static_cast<uint8_t>(begin[1]) & 0x3fu) << 12u) |
+                     ((static_cast<uint8_t>(begin[2]) & 0x3fu) << 6u) |
+                     (static_cast<uint8_t>(begin[3]) & 0x3fu);
+        smallest_allowed = 0x10000;
+        well_formed = true;
+      }
+      break;
+    default:
+      break;  // length == 0: the first byte cannot start a sequence
+  }
+
+  if (well_formed && code_point >= smallest_allowed &&
+      check_codepoint(code_point)) {
+    *utf8_len = length;
+    return code_point;
+  }
+  // Invalid utf-8
+  *utf8_len = 1;
+  return INVALID_UNICODE;
+}
+// Appends the UTF-8 encoding of code point `x` (1 to 4 bytes) through `it`.
+// `x` must satisfy check_codepoint(); this is only enforced by an assert.
+void utf8_to_chars(uint32_t x, std::back_insert_iterator<string> it) {
+  assert(check_codepoint(x));
+  // Writes the low 8 bits of `value` as the next output byte.
+  const auto emit = [&it](uint32_t value) {
+    *(it++) = static_cast<char>(value);
+  };
+  // Builds a continuation byte from the 6 bits of `x` starting at `shift`.
+  const auto continuation = [x](uint32_t shift) -> uint32_t {
+    return 0x80u | ((x >> shift) & 0x3fu);
+  };
+
+  if (x <= 0x7f) {
+    emit(x);
+  } else if (x <= 0x7ff) {
+    emit(0xc0u | (x >> 6u));
+    emit(continuation(0u));
+  } else if (x <= 0xffff) {
+    emit(0xe0u | (x >> 12u));
+    emit(continuation(6u));
+    emit(continuation(0u));
+  } else {
+    emit(0xf0u | (x >> 18u));
+    emit(continuation(12u));
+    emit(continuation(6u));
+    emit(continuation(0u));
+  }
+}
+// Encodes a sequence of code points as a UTF-8 byte string by appending the
+// encoding of each element in order.
+string encode_utf8(const vector<uint32_t>& text) {
+  string utf8_text;
+  auto sink = std::back_inserter(utf8_text);
+  for (auto cp = text.begin(); cp != text.end(); ++cp) {
+    utf8_to_chars(*cp, sink);
+  }
+  return utf8_text;
+}
+// Decodes the byte range [begin, end) into code points.
+// Sequences that fail to decode are dropped one byte at a time; if at least
+// one was dropped, a single warning is written to std::cerr.
+vector<uint32_t> decode_utf8(const char* begin, const char* end) {
+  vector<uint32_t> decoded_text;
+  bool saw_invalid = false;
+  const char* cursor = begin;
+  while (cursor < end) {
+    uint64_t consumed = 0;
+    const uint32_t code_point = chars_to_utf8(cursor, end - cursor, &consumed);
+    if (code_point == INVALID_UNICODE) {
+      saw_invalid = true;
+    } else {
+      decoded_text.push_back(code_point);
+    }
+    cursor += consumed;
+  }
+  if (saw_invalid) {
+    std::cerr << "WARNING Input contains invalid unicode characters."
+              << std::endl;
+  }
+  return decoded_text;
+}
+// Convenience overload: decodes the whole contents of `utf8_text`.
+vector<uint32_t> decode_utf8(const string& utf8_text) {
+  const char* first = utf8_text.data();
+  const char* last = first + utf8_text.size();
+  return decode_utf8(first, last);
+}
+}  // namespace vkcom

data/vendor/YouTokenToMe/youtokentome/cpp/utf8.h ADDED Viewed

@@ -0,0 +1,23 @@
+#pragma once
+#include "utils.h"
+namespace vkcom {
+// Sentinel returned by chars_to_utf8() when the input is not a valid UTF-8
+// sequence. The value is larger than 0x10ffff, the largest code point that
+// chars_to_utf8() accepts, so it never collides with a decoded code point.
+constexpr static uint32_t INVALID_UNICODE = 0x0fffffff;
+// Decodes one UTF-8 sequence from the `size` bytes at `begin`. Stores the
+// number of bytes consumed in `*utf8_len` and returns the code point, or
+// INVALID_UNICODE (with `*utf8_len` == 1) on malformed input.
+uint32_t chars_to_utf8(const char* begin, uint64_t size, uint64_t* utf8_len);
+// Encodes a sequence of code points as a UTF-8 string.
+std::string encode_utf8(const std::vector<uint32_t> &utext);
+// Decodes the byte range [begin, end) into code points, skipping invalid
+// bytes.
+std::vector<uint32_t> decode_utf8(const char *begin, const char *end);
+// Decodes the whole of `utf8_text` into code points.
+std::vector<uint32_t> decode_utf8(const std::string &utf8_text);
+} // namespace vkcom

data/vendor/YouTokenToMe/youtokentome/cpp/utils.cpp ADDED Viewed

@@ -0,0 +1,119 @@
+#include "utils.h"
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+namespace vkcom {
+using std::string;
+using std::vector;
+// Writes the four special-token ids to `fout` on one line, separated by
+// single spaces, in the order unk, pad, bos, eos (the order load() expects).
+void SpecialTokens::dump(std::ofstream &fout) {
+  fout << unk_id << " ";
+  fout << pad_id << " ";
+  fout << bos_id << " ";
+  fout << eos_id << std::endl;
+}
+// Reads the four special-token ids from `fin` in the order written by
+// dump(): unk, pad, bos, eos. Extraction errors are not reported here.
+void SpecialTokens::load(std::ifstream &fin) {
+  fin >> unk_id;
+  fin >> pad_id;
+  fin >> bos_id;
+  fin >> eos_id;
+}
+// Returns the largest of the four special-token ids, never less than 0
+// (so the result is 0 when every id is unset, i.e. -1).
+uint32_t SpecialTokens::max_id() const {
+  const int highest_assigned =
+      std::max(std::max(unk_id, pad_id), std::max(bos_id, eos_id));
+  return std::max(0, highest_assigned);
+}
+// Returns true when `id` equals any of the four special-token ids.
+bool SpecialTokens::taken_id(int id) const {
+  const int reserved[] = {unk_id, pad_id, bos_id, eos_id};
+  for (const int candidate : reserved) {
+    if (candidate == id) {
+      return true;
+    }
+  }
+  return false;
+}
+// Counts how many of the four special tokens are set, i.e. have an id
+// other than -1.
+uint64_t SpecialTokens::n_special_tokens() const {
+  const int ids[] = {unk_id, pad_id, bos_id, eos_id};
+  uint64_t assigned = 0;
+  for (const int id : ids) {
+    if (id != -1) {
+      ++assigned;
+    }
+  }
+  return assigned;
+}
+SpecialTokens::SpecialTokens(int pad_id, int unk_id, int bos_id, int eos_id)
+    : pad_id(pad_id), unk_id(unk_id), bos_id(bos_id), eos_id(eos_id) {}
+bool BPE_Rule::operator==(const BPE_Rule &other) const {
+  return x == other.x && y == other.y && z == other.z;
+}
+BPE_Rule::BPE_Rule(uint32_t x, uint32_t y, uint32_t z) : x(x), y(y), z(z) {}
+// Writes the model state to `file_name` as whitespace-separated text:
+//   line 1:        <char2id.size()> <rules.size()>
+//   next n lines:  one "<key> <value>" pair per char2id entry
+//   next m lines:  one "<x> <y> <z>" triple per merge rule
+//   last line:     the special-token ids (see SpecialTokens::dump)
+// If the file cannot be opened an error is printed to std::cerr. Debug
+// builds then abort on the assert; builds with NDEBUG return without
+// writing anything rather than streaming into a failed file.
+void BPEState::dump(const string &file_name) {
+  std::ofstream fout(file_name, std::ios::out);
+  if (fout.fail()) {
+    std::cerr << "Can't open file: " << file_name << std::endl;
+    assert(false);
+    return;  // reached only when asserts are compiled out
+  }
+  fout << char2id.size() << " " << rules.size() << "\n";
+  // Iterate by const reference: no need to copy each entry just to print it.
+  for (const auto &entry : char2id) {
+    fout << entry.first << " " << entry.second << "\n";
+  }
+  for (const auto &rule : rules) {
+    fout << rule.x << " " << rule.y << " " << rule.z << "\n";
+  }
+  special_tokens.dump(fout);
+  fout.close();
+}
+// Replaces the current state with the model stored in `file_name`, which
+// must be in the text format produced by BPEState::dump().
+//
+// Returns a default (ok) Status on success. Returns Status(1, message) when
+// the file cannot be opened, or when it is truncated or otherwise fails to
+// parse; in the parse-failure case the partially read tables are cleared so
+// the object never holds half a model.
+Status BPEState::load(const string &file_name) {
+  char2id.clear();
+  rules.clear();
+  std::ifstream fin(file_name, std::ios::in);
+  if (fin.fail()) {
+    return Status(1, "Can not open file with model: " + file_name);
+  }
+  const string malformed_message =
+      "Model file is corrupted or truncated: " + file_name;
+
+  int n = 0;
+  int m = 0;
+  fin >> n >> m;
+  if (fin.fail() || n < 0 || m < 0) {
+    return Status(1, malformed_message);
+  }
+  for (int i = 0; i < n; i++) {
+    uint32_t inner_id = 0;
+    uint32_t utf32_id = 0;
+    if (!(fin >> inner_id >> utf32_id)) {
+      char2id.clear();
+      return Status(1, malformed_message);
+    }
+    char2id[inner_id] = utf32_id;
+  }
+  for (int i = 0; i < m; i++) {
+    uint32_t x = 0;
+    uint32_t y = 0;
+    uint32_t z = 0;
+    if (!(fin >> x >> y >> z)) {
+      char2id.clear();
+      rules.clear();
+      return Status(1, malformed_message);
+    }
+    rules.emplace_back(x, y, z);
+  }
+  special_tokens.load(fin);
+  // A successful read of the last id may set eofbit but never failbit, so
+  // fail() here means the special-token line was missing or incomplete.
+  if (fin.fail()) {
+    char2id.clear();
+    rules.clear();
+    return Status(1, malformed_message);
+  }
+  fin.close();
+  return Status();
+}
+BpeConfig::BpeConfig(double _character_coverage, int _n_threads,
+                     const SpecialTokens &_special_tokens)
+    : character_coverage(_character_coverage),
+      n_threads(_n_threads),
+      special_tokens(_special_tokens) {}
+// Reads lines from std::cin until `*processed` reaches `batch_limit` or the
+// input ends. `*processed` is increased by the length of every line read
+// (line terminators are not counted). Nothing is read when `*processed` is
+// already at or above the limit on entry.
+vector<string> read_lines_from_stdin(uint64_t batch_limit, uint64_t *processed) {
+  vector<string> sentences;
+  for (string line; *processed < batch_limit && getline(std::cin, line);) {
+    *processed += line.size();
+    // getline() overwrites `line` on the next pass, so moving out is safe.
+    sentences.push_back(std::move(line));
+  }
+  return sentences;
+}
+Status::Status(int code, std::string message) : code(code), message(std::move(message)) {}
+const std::string &Status::error_message() const {
+  return message;
+}
+bool Status::ok() const {
+  return code == 0;
+}
+}  // namespace vkcom

data/vendor/YouTokenToMe/youtokentome/cpp/utils.h ADDED Viewed

@@ -0,0 +1,105 @@
+#pragma once
+#include <iostream>
+#include <string>
+#include <vector>
+#include "third_party/flat_hash_map.h"
+namespace vkcom {
+// Code point 9601 (0x2581).
+// NOTE(review): presumably the marker substituted for whitespace during
+// tokenization -- confirm against bpe.cpp.
+const uint32_t SPACE_TOKEN = 9601;
+// A single merge rule: the pair (x, y) is replaced by z.
+struct BPE_Rule {
+  // x + y -> z
+  uint32_t x{0};
+  uint32_t y{0};
+  uint32_t z{0};
+  BPE_Rule() = default;
+  BPE_Rule(uint32_t x, uint32_t y, uint32_t z);
+  // Field-wise equality over x, y and z.
+  bool operator==(const BPE_Rule &other) const;
+};
+// Ids of the four special tokens. An id of -1 means the token is not set
+// (n_special_tokens() counts only ids different from -1).
+struct SpecialTokens {
+  int pad_id = -1;
+  int unk_id = -1;
+  int bos_id = -1;
+  int eos_id = -1;
+  SpecialTokens() = default;
+  // Argument order is pad, unk, bos, eos.
+  SpecialTokens(int pad_id, int unk_id, int bos_id, int eos_id);
+  // Writes the ids to `fout` as one text line in the order unk, pad, bos, eos.
+  void dump(std::ofstream &fout);
+  // Reads the ids from `fin` in the order written by dump().
+  void load(std::ifstream &fin);
+  // Largest of the four ids, clamped below at 0.
+  uint32_t max_id() const;
+  // True when `id` equals one of the four ids.
+  bool taken_id(int id) const;
+  // Number of ids that are not -1.
+  uint64_t n_special_tokens() const;
+};
+// Training options passed to train_bpe().
+struct BpeConfig {
+  // NOTE(review): presumably the fraction of characters the alphabet must
+  // cover, in (0, 1] -- confirm against bpe.cpp.
+  double character_coverage = 1;
+  // NOTE(review): the Python wrapper passes -1 by default while this default
+  // is 0; how non-positive values are interpreted is decided in bpe.cpp.
+  int n_threads = 0;
+  SpecialTokens special_tokens;
+  BpeConfig() = default;
+  BpeConfig(double character_coverage, int n_threads,
+            const SpecialTokens &special_tokens);
+};
+// Result of an operation that can fail: a numeric code plus a message.
+// A default-constructed Status has code 0 and reports ok().
+struct Status {
+  int code{0};
+  std::string message;
+  Status() = default;
+  Status(int code, std::string message);
+  // Returns the stored message.
+  const std::string &error_message() const;
+  // True when code == 0.
+  bool ok() const;
+};
+// Serializable model state: a character table, the merge rules and the
+// special-token ids.
+struct BPEState {
+  // NOTE(review): by its name this maps a character code to an internal id,
+  // but load() names the key `inner_id` and the value `utf32_id` -- confirm
+  // the direction against bpe.cpp.
+  flat_hash_map<uint32_t, uint32_t> char2id;
+  std::vector<BPE_Rule> rules;
+  SpecialTokens special_tokens;
+  // Writes the state to `file_name` as text.
+  void dump(const std::string &file_name);
+  // Replaces the state with the contents of `file_name`; the returned
+  // Status reports failure to open the file.
+  Status load(const std::string &file_name);
+};
+// Pair of parallel outputs: token ids and token strings.
+// NOTE(review): not used anywhere in this header or utils.cpp; the exact
+// producer lives elsewhere -- confirm usage in bpe.cpp.
+struct DecodeResult {
+  std::vector<int> ids;
+  std::vector<std::string> pieces;
+};
+// Per-call encoding options. The members have no default initializers, so
+// every field must be set by whoever creates an instance.
+struct EncodingConfig {
+  bool bos;
+  bool eos;
+  bool reverse;
+  // The Python wrapper rejects values outside [0, 1] before calling in.
+  double dropout_prob;
+};
+// NOTE(review): not defined in utils.cpp; by its name it tests whether `ch`
+// is a whitespace code point -- confirm against the defining file.
+bool is_space(uint32_t ch);
+// Reads lines from std::cin while `*processed` is below `batch_limit`,
+// adding the length of each line read to `*processed`.
+std::vector<std::string> read_lines_from_stdin(uint64_t batch_limit, uint64_t *processed);
+// Prints every inner vector on its own line of std::cout, with each element
+// followed by a single space (so lines end in " \n"). When `flush` is true
+// the stream is flushed once after all lines have been written.
+template<typename T>
+void write_to_stdout(const std::vector<std::vector<T>> &sentences, bool flush) {
+  for (auto row = sentences.begin(); row != sentences.end(); ++row) {
+    for (auto cell = row->begin(); cell != row->end(); ++cell) {
+      std::cout << *cell << " ";
+    }
+    std::cout << "\n";
+  }
+  if (!flush) {
+    return;
+  }
+  std::cout << std::flush;
+}
+}  // namespace vkcom

data/vendor/YouTokenToMe/youtokentome/cpp/yttm.pyx ADDED Viewed

@@ -0,0 +1,182 @@
+from libcpp.vector cimport vector
+from libcpp.unordered_set cimport unordered_set
+from libcpp.string cimport string
+from libcpp cimport bool
+import os
+from pathlib import Path
+from typing import Collection
+# Plain data types from the C++ library. Only the fields the wrapper reads
+# or writes are declared here.
+cdef extern from "bpe.h" namespace "vkcom":
+    # Ids of the special tokens.
+    cdef cppclass SpecialTokens:
+        int pad_id
+        int unk_id
+        int bos_id
+        int eos_id
+    # Training options passed to train_bpe().
+    cdef cppclass BpeConfig:
+        double character_coverage
+        int n_threads
+        SpecialTokens special_tokens
+    # Result of a C++ call; the wrapper treats code != 0 as an error and
+    # raises ValueError with `message`.
+    cdef cppclass Status:
+        int code
+        string message
+# Training entry point; reads from `source_path` and writes the model to
+# `model_path`.
+cdef extern from "bpe.h" namespace "vkcom":
+    Status train_bpe(const string &source_path, const string& model_path, int vocab_size, const BpeConfig& bpe_config)
+# The C++ encoder/decoder. Methods returning Status report errors through
+# it; output is written through the pointer arguments.
+cdef extern from "bpe.h" namespace "vkcom":
+    cdef cppclass BaseEncoder:
+        # Loads the model at `model_path`; the outcome is stored in `*status`.
+        BaseEncoder(const string& model_path, int n_threads, Status* status)
+        Status encode_as_ids(const vector[string] &sentences, vector[vector[int]]* ids, bool bos, bool eos, bool reverse, double dropout_prob) const
+        Status encode_as_subwords(const vector[string]& sentences, vector[vector[string]]* subwords, bool bos, bool eos, bool reverse, double dropout_prob) const
+        Status encode_cli(string output_type, bool stream, bool bos, bool eos, bool reverse, double dropout_prob) const
+        Status decode_cli(const unordered_set[int]* ignore_ids) const
+        void vocab_cli(bool verbose) const
+        Status id_to_subword(int id, string* subword) const
+        int subword_to_id(const string &subword) const
+        Status decode(const vector[vector[int]]& ids, vector[string]* output, const unordered_set[int]* ignore_ids) const
+        int vocab_size() const
+        vector[string] vocabulary() const
+# Python-facing wrapper that owns one heap-allocated C++ BaseEncoder.
+cdef class BPE:
+    # Owned pointer; allocated in __init__ and released in __dealloc__.
+    cdef BaseEncoder* encoder
+    def __dealloc__(self):
+        # Frees the C++ encoder when the Python object is destroyed.
+        del self.encoder
+    def __init__(self, model_path, n_threads=-1):
+        """Load the model stored at ``model_path``.
+
+        ``n_threads`` is forwarded unchanged to the C++ encoder.
+        Raises ValueError with the C++ error message when loading fails.
+        """
+        cdef Status load_status
+        self.encoder = new BaseEncoder(model_path.encode(), n_threads, &load_status)
+        if load_status.code == 0:
+            return
+        raise ValueError(load_status.message.decode())
+    @staticmethod
+    def train(data,
+              model,
+              vocab_size,
+              coverage=1.0,
+              n_threads=-1,
+              pad_id=0,
+              unk_id=1,
+              bos_id=2,
+              eos_id=3):
+        """Train a BPE model on the file ``data`` and write it to ``model``.
+
+        The remaining arguments are copied into a BpeConfig and passed to
+        the C++ ``train_bpe``. Raises ValueError with the C++ error message
+        when training reports a non-zero status code.
+        """
+        cdef BpeConfig cfg
+        cfg.character_coverage = coverage
+        cfg.n_threads = n_threads
+        cfg.special_tokens.pad_id = pad_id
+        cfg.special_tokens.unk_id = unk_id
+        cfg.special_tokens.bos_id = bos_id
+        cfg.special_tokens.eos_id = eos_id
+        cdef Status outcome = train_bpe(data.encode(), model.encode(), vocab_size, cfg)
+        if outcome.code == 0:
+            return
+        raise ValueError(outcome.message.decode())
+    def encode(self, sentences, output_type, bos, eos, reverse, dropout_prob):
+        """Encode one sentence or a list/tuple of sentences.
+
+        ``output_type`` is ``'id'`` (integer ids) or ``'subword'`` (token
+        strings). A single ``str`` yields one flat list; a list or tuple
+        yields one list per sentence.
+
+        Raises ValueError when ``dropout_prob`` is outside [0, 1], when
+        ``output_type`` is not recognised, or when the C++ encoder reports
+        an error.
+        """
+        cdef vector[string] s
+        cdef vector[vector[string]] ret_subwords
+        cdef vector[vector[int]] ret_ids
+        cdef Status status
+        if dropout_prob < 0 or dropout_prob > 1:
+            raise ValueError("dropout_prob value must be in the range [0, 1]. Current value of dropout_prob = " + str(dropout_prob))
+
+        # Validate output_type before touching `sentences`, as the checks
+        # are expected to fire in this order.
+        want_ids = output_type == 'id'
+        if not want_ids and not (output_type == 'subword'):
+            raise ValueError('output_type must be equal to "id" or "subword"')
+
+        # Both output types take the same input conversion.
+        single = isinstance(sentences, str)
+        if single:
+            s = [sentences.encode()]
+        else:
+            assert isinstance(sentences, list) or isinstance(sentences, tuple)
+            s = [x.encode() for x in sentences]
+
+        if want_ids:
+            status = self.encoder.encode_as_ids(s, &ret_ids, bos, eos, reverse, dropout_prob)
+            if status.code != 0:
+                raise ValueError(status.message.decode())
+            if single:
+                return ret_ids[0]
+            return ret_ids
+
+        status = self.encoder.encode_as_subwords(s, &ret_subwords, bos, eos, reverse, dropout_prob)
+        if status.code != 0:
+            raise ValueError(status.message.decode())
+        if single:
+            assert len(ret_subwords) == 1
+            return [piece.decode() for piece in ret_subwords[0]]
+        return [[piece.decode() for piece in sentence] for sentence in ret_subwords]
+    def subword_to_id(self, subword):
+        """Return the id the C++ encoder reports for the token ``subword``."""
+        encoded = subword.encode()
+        return self.encoder.subword_to_id(encoded)
+    def id_to_subword(self, id):
+        """Return the token string for ``id``.
+
+        Raises ValueError with the C++ error message when the lookup fails.
+        """
+        cdef string piece
+        cdef Status outcome = self.encoder.id_to_subword(id, &piece)
+        if outcome.code == 0:
+            return piece.decode()
+        raise ValueError(outcome.message.decode())
+    def decode(self, ids, ignore_ids):
+        """Turn ids back into text.
+
+        ``ids`` must be a list: either a flat list of ints (one sentence) or
+        a list of such lists. ``ignore_ids`` is a collection of ids to pass
+        to the decoder as ignored, or None for none. Always returns a list
+        of strings, one per sentence.
+
+        Raises TypeError for a wrongly typed argument and ValueError when
+        the C++ decoder reports an error.
+        """
+        if not isinstance(ids, list):
+            raise TypeError(
+                "{} is not a list instance".format(type(ids))
+            )
+        if ignore_ids is not None and not isinstance(ignore_ids, Collection):
+            raise TypeError(
+                "{} is not a Collection instance".format(type(ignore_ids))
+            )
+        # A flat list of ints is treated as a batch holding one sentence.
+        if len(ids) > 0 and isinstance(ids[0], int):
+            ids = [ids]
+        if ignore_ids is None:
+            ignore_ids = set()
+        cdef vector[string] decoded
+        cdef unordered_set[int] c_ignore_ids = unordered_set[int](ignore_ids)
+        cdef Status outcome = self.encoder.decode(ids, &decoded, &c_ignore_ids)
+        if outcome.code != 0:
+            raise ValueError(outcome.message.decode())
+        return [sentence.decode() for sentence in decoded]
+    def vocab_size(self):
+        """Return the vocabulary size reported by the C++ encoder."""
+        return self.encoder.vocab_size()
+    def vocab(self):
+        """Return the vocabulary as a list of decoded token strings."""
+        cdef vector[string] raw_tokens = self.encoder.vocabulary()
+        return [entry.decode() for entry in raw_tokens]
+    def encode_cli(self, output_type, stream, bos, eos, reverse, dropout_prob):
+        """Run the C++ command-line encoding routine.
+
+        Raises ValueError with the C++ error message on a non-zero status.
+        """
+        cdef Status outcome = self.encoder.encode_cli(output_type.encode(), stream, bos, eos, reverse, dropout_prob)
+        if outcome.code == 0:
+            return
+        raise ValueError(outcome.message.decode())
+    def decode_cli(self, ignore_ids):
+        """Run the C++ command-line decoding routine.
+
+        ``ignore_ids`` may be None, which is treated as an empty set.
+        Raises ValueError with the C++ error message on a non-zero status.
+        """
+        if ignore_ids is None:
+            ignore_ids = set()
+        cdef unordered_set[int] c_ignore_ids = unordered_set[int](ignore_ids)
+        cdef Status outcome = self.encoder.decode_cli(&c_ignore_ids)
+        if outcome.code == 0:
+            return
+        raise ValueError(outcome.message.decode())
+    def vocab_cli(self, verbose):
+        # Delegates to the C++ vocab_cli routine; `verbose` is forwarded
+        # unchanged and nothing is returned.
+        self.encoder.vocab_cli(verbose)