RubyGems - rbbt-dm - Versions diffs - 1.2.6 → 1.2.9 - Mend

rbbt-dm 1.2.6 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +4 -4
data/lib/rbbt/matrix/barcode.rb +2 -2
data/lib/rbbt/matrix/differential.rb +3 -3
data/lib/rbbt/matrix/knowledge_base.rb +1 -1
data/lib/rbbt/plots/bar.rb +1 -1
data/lib/rbbt/stan.rb +1 -1
data/lib/rbbt/statistics/hypergeometric.rb +2 -1
data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
data/lib/rbbt/vector/model/huggingface.rb +57 -38
data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
data/lib/rbbt/vector/model/random_forest.rb +1 -1
data/lib/rbbt/vector/model/spaCy.rb +8 -14
data/lib/rbbt/vector/model/tensorflow.rb +6 -5
data/lib/rbbt/vector/model/torch.rb +37 -0
data/lib/rbbt/vector/model/util.rb +18 -0
data/lib/rbbt/vector/model.rb +100 -56
data/python/rbbt_dm/__init__.py +48 -1
data/python/rbbt_dm/atcold/__init__.py +0 -0
data/python/rbbt_dm/atcold/plot_lib.py +141 -0
data/python/rbbt_dm/atcold/spiral.py +27 -0
data/python/rbbt_dm/huggingface.py +57 -26
data/python/rbbt_dm/language_model.py +70 -0
data/python/rbbt_dm/util.py +30 -0
data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
data/test/rbbt/vector/model/test_huggingface.rb +258 -27
data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
data/test/rbbt/vector/model/test_spaCy.rb +1 -1
data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
data/test/rbbt/vector/test_model.rb +25 -26
data/test/test_helper.rb +13 -0
metadata +26 -16
data/lib/rbbt/tensorflow.rb +0 -43
data/lib/rbbt/vector/model/huggingface.old.rb +0 -160

data/lib/rbbt/vector/model.rb CHANGED Viewed

@@ -1,16 +1,25 @@
 require 'rbbt/util/R'
 require 'rbbt/vector/model/util'
+require 'rbbt/util/python'
+RbbtPython.add_path Rbbt.python.find(:lib)
+RbbtPython.init_rbbt
 class VectorModel
-  attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model, :post_process
+  attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
   attr_accessor :features, :names, :labels, :factor_levels
-  attr_accessor :model_options
+  attr_accessor :model, :model_options
   def extract_features(&block)
     @extract_features = block if block_given?
     @extract_features
   end
+  # Getter/setter for the model-initialisation block: called with a block it
+  # stores that block; in both cases it returns the currently stored block.
+  def init_model(&block)
+    return @init_model unless block_given?
+    @init_model = block
+  end
   def train_model(&block)
     @train_model = block if block_given?
     @train_model
@@ -21,13 +30,17 @@ class VectorModel
     @eval_model
   end
+  # Builds the model by running the stored @init_model block in the context
+  # of this instance; the result is memoised in @model.
+  def init
+    @model ||= instance_exec(&@init_model)
+  end
   def post_process(&block)
     @post_process = block if block_given?
     @post_process
   end
-  def self.R_run(model_file, features, labels, code, names = nil, factor_levels = nil)
+  def self.R_run(model_path, features, labels, code, names = nil, factor_levels = nil)
     TmpFile.with_file do |feature_file|
       Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
       Open.write(feature_file + '.label', labels * "\n" + "\n")
@@ -54,7 +67,7 @@ features = cbind(features, label = labels);
     end
   end
-  def self.R_train(model_file, features, labels, code, names = nil, factor_levels = nil)
+  def self.R_train(model_path, features, labels, code, names = nil, factor_levels = nil)
     TmpFile.with_file do |feature_file|
       Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
       Open.write(feature_file + '.label', labels * "\n" + "\n")
@@ -82,13 +95,13 @@ for (c in names(features)){
   if (is.factor(features[[c]]))
     factor_levels[c] = paste(levels(features[[c]]), collapse="\t")
 }
-rbbt.tsv.write("#{model_file}.factor_levels", factor_levels, names=c('Levels'), type='flat')
-save(model, file='#{model_file}')
+rbbt.tsv.write("#{model_path}.factor_levels", factor_levels, names=c('Levels'), type='flat')
+save(model, file='#{model_path}')
       EOF
     end
   end
-  def self.R_eval(model_file, features, list, code, names = nil, factor_levels = nil)
+  def self.R_eval(model_path, features, list, code, names = nil, factor_levels = nil)
     TmpFile.with_file do |feature_file|
       if list
         Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
@@ -105,7 +118,7 @@ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
 #{ factor_levels.collect do |name,levels|
     "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
   end * "\n" if factor_levels }
-load(file="#{model_file}");
+load(file="#{model_path}");
 #{code}
 cat(paste(label, sep="\\n", collapse="\\n"));
         EOF
@@ -127,61 +140,77 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     instance_eval code, file
   end
-  def initialize(directory = nil, extract_features = nil, train_model = nil, eval_model = nil, post_process = nil, names = nil, factor_levels = nil)
+  def initialize(directory = nil, model_options = {})
     @directory = directory
+    @model_options = IndiferentHash.setup(model_options)
     if @directory
-      FileUtils.mkdir_p @directory unless File.exists?(@directory)
+      FileUtils.mkdir_p @directory unless File.exist?(@directory)
+      @model_path            = File.join(@directory, "model")
-      @model_file = File.join(@directory, "model")
       @extract_features_file = File.join(@directory, "features")
-      @train_model_file = File.join(@directory, "train_model")
-      @eval_model_file = File.join(@directory, "eval_model")
-      @post_process_file = File.join(@directory, "post_process")
-      @train_model_file_R = File.join(@directory, "train_model.R")
-      @eval_model_file_R = File.join(@directory, "eval_model.R")
-      @post_process_file_R = File.join(@directory, "post_process.R")
-      @names_file = File.join(@directory, "feature_names")
-      @levels_file = File.join(@directory, "levels")
-      @options_file = File.join(@directory, "options.json")
-      if File.exists?(@options_file)
-        @model_options = JSON.parse(Open.read(@options_file))
+      @init_model_path       = File.join(@directory, "init_model")
+      @train_model_path      = File.join(@directory, "train_model")
+      @train_model_path_R    = File.join(@directory, "train_model.R")
+      @eval_model_path       = File.join(@directory, "eval_model")
+      @eval_model_path_R     = File.join(@directory, "eval_model.R")
+      @post_process_file     = File.join(@directory, "post_process")
+      @post_process_file_R   = File.join(@directory, "post_process.R")
+      @names_file            = File.join(@directory, "feature_names")
+      @levels_file           = File.join(@directory, "levels")
+      @options_file          = File.join(@directory, "options.json")
+      if File.exist?(@options_file)
+        @model_options = JSON.parse(Open.read(@options_file)).merge(@model_options || {})
         IndiferentHash.setup(@model_options)
       end
     end
     if extract_features.nil?
-      if @extract_features_file && File.exists?(@extract_features_file)
+      if @extract_features_file && File.exist?(@extract_features_file)
         @extract_features = __load_method @extract_features_file
       end
     else
       @extract_features = extract_features
     end
+    if init_model.nil?
+      if @init_model_path && File.exist?(@init_model_path)
+        @init_model = __load_method @init_model_path
+      end
+    else
+      @init_model = init_model
+    end
     if train_model.nil?
-      if @train_model_file && File.exists?(@train_model_file)
-        @train_model = __load_method @train_model_file
-      elsif @train_model_file_R && File.exists?(@train_model_file_R)
-        @train_model = Open.read(@train_model_file_R)
+      if @train_model_path && File.exist?(@train_model_path)
+        @train_model = __load_method @train_model_path
+      elsif @train_model_path_R && File.exist?(@train_model_path_R)
+        @train_model = Open.read(@train_model_path_R)
       end
     else
       @train_model = train_model
     end
     if eval_model.nil?
-      if @eval_model_file && File.exists?(@eval_model_file)
-        @eval_model = __load_method @eval_model_file
-      elsif @eval_model_file_R && File.exists?(@eval_model_file_R)
-        @eval_model = Open.read(@eval_model_file_R)
+      if @eval_model_path && File.exist?(@eval_model_path)
+        @eval_model = __load_method @eval_model_path
+      elsif @eval_model_path_R && File.exist?(@eval_model_path_R)
+        @eval_model = Open.read(@eval_model_path_R)
       end
     else
       @eval_model = eval_model
     end
     if post_process.nil?
-      if @post_process_file && File.exists?(@post_process_file)
+      if @post_process_file && File.exist?(@post_process_file)
         @post_process = __load_method @post_process_file
-      elsif @post_process_file_R && File.exists?(@post_process_file_R)
+      elsif @post_process_file_R && File.exist?(@post_process_file_R)
         @post_process = Open.read(@post_process_file_R)
       end
     else
@@ -190,7 +219,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     if names.nil?
-      if @names_file && File.exists?(@names_file)
+      if @names_file && File.exist?(@names_file)
         @names = Open.read(@names_file).split("\n")
       end
     else
@@ -198,11 +227,11 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     end
     if factor_levels.nil?
-      if @levels_file && File.exists?(@levels_file)
+      if @levels_file && File.exist?(@levels_file)
         @factor_levels = YAML.load(Open.read(@levels_file))
       end
-      if @model_file && File.exists?(@model_file + '.factor_levels')
-        @factor_levels = TSV.open(@model_file + '.factor_levels')
+      if @model_path && File.exist?(@model_path + '.factor_levels')
+        @factor_levels = TSV.open(@model_path + '.factor_levels')
       end
     else
       @factor_levels = factor_levels
@@ -241,23 +270,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     case
     when Proc === train_model
       begin
-        Open.write(@train_model_file, train_model.source)
+        Open.write(@train_model_path, train_model.source)
       rescue
       end
     when String === train_model
-      Open.write(@train_model_file_R, @train_model)
+      Open.write(@train_model_path_R, @train_model)
     end
     Open.write(@extract_features_file, @extract_features.source) if @extract_features
+    Open.write(@init_model_path, @init_model.source) if @init_model
     case
     when Proc === eval_model
       begin
-        Open.write(@eval_model_file, eval_model.source)
+        Open.write(@eval_model_path, eval_model.source)
       rescue
       end
     when String === eval_model
-      Open.write(@eval_model_file_R, eval_model)
+      Open.write(@eval_model_path_R, eval_model)
     end
     case
@@ -270,24 +300,37 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       Open.write(@post_process_file_R, post_process)
     end
     Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
     Open.write(@names_file, @names * "\n" + "\n") if @names
     Open.write(@options_file, @model_options.to_json) if @model_options
   end
   def train
-    case
-    when Proc === @train_model
-      self.instance_exec(@model_file, @features, @labels, @names, @factor_levels, &@train_model)
-    when String === @train_model
-      VectorModel.R_train(@model_file,  @features, @labels, train_model, @names, @factor_levels)
+    begin
+      if @balance
+        @original_features = @features
+        @original_labels = @labels
+        self.balance_labels
+      end
+      case
+      when Proc === @train_model
+        self.instance_exec(@features, @labels, @names, @factor_levels, &@train_model)
+      when String === @train_model
+        VectorModel.R_train(@model_path, @features, @labels, train_model, @names, @factor_levels)
+      end
+    ensure
+      if @balance
+        @features =  @original_features
+        @labels = @original_labels
+      end
     end
     save_models if @directory
   end
   def run(code)
-    VectorModel.R_run(@model_file,  @features, @labels, code, @names, @factor_levels)
+    VectorModel.R_run(@model_path,  @features, @labels, code, @names, @factor_levels)
   end
   def eval(element)
@@ -295,14 +338,14 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     result = case
              when Proc === @eval_model
-               self.instance_exec(@model_file, features, false, nil, @names, @factor_levels, &@eval_model)
+               self.instance_exec(features, false, nil, @names, @factor_levels, &@eval_model)
              when String === @eval_model
-               VectorModel.R_eval(@model_file, features, false, eval_model, @names, @factor_levels)
+               VectorModel.R_eval(@model_path, features, false, eval_model, @names, @factor_levels)
              else
                raise "No @eval_model function or R script"
              end
-    result = self.instance_exec(result, &@post_process) if Proc === @post_process
+    result = self.instance_exec(result, false, &@post_process) if Proc === @post_process
     result
   end
@@ -321,12 +364,12 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     result = case
              when Proc === eval_model
-               self.instance_exec(@model_file, features, true, nil, @names, @factor_levels, &@eval_model)
+               self.instance_exec(features, true, nil, @names, @factor_levels, &@eval_model)
              when String === eval_model
-               VectorModel.R_eval(@model_file, features, true, eval_model, @names, @factor_levels)
+               VectorModel.R_eval(@model_path, features, true, eval_model, @names, @factor_levels)
              end
-    result = self.instance_exec(result, &@post_process) if Proc === @post_process
+    result = self.instance_exec(result, true, &@post_process) if Proc === @post_process
     result
   end
@@ -438,6 +481,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @features = orig_features
       @labels = orig_labels
     end unless folds == -1
     self.reset_model if self.respond_to? :reset_model
     self.train unless folds == 1
     res

data/python/rbbt_dm/__init__.py CHANGED Viewed

@@ -1 +1,48 @@
-# Keep
+from torch.utils.data import Dataset, DataLoader
+class TSVDataset(Dataset):
+    """Map-style torch Dataset over a table whose last column is the label.
+
+    `tsv` is accessed through `.iloc`, `.loc` and `len()`, so it is expected
+    to behave like a pandas DataFrame.
+    """
+    def __init__(self, tsv):
+        self.tsv = tsv
+    def __getitem__(self, key):
+        """Return `(features, label)` for one row.
+
+        Integer keys select the row by position; any other key is treated as
+        a row label. The label is the last column, the features all others.
+        """
+        import numbers
+        # isinstance() rather than an exact type comparison, so NumPy integers
+        # and int subclasses are also treated as positions and not as labels.
+        if isinstance(key, numbers.Integral):
+            row = self.tsv.iloc[key]
+        else:
+            row = self.tsv.loc[key]
+        row = row.to_numpy()
+        features = row[:-1]
+        label = row[-1]
+        return features, label
+    def __len__(self):
+        return len(self.tsv)
+def tsv_dataset(filename, *args, **kwargs):
+    """Load `filename` with `rbbt.tsv` (extra arguments are forwarded) and wrap the result in a TSVDataset."""
+    import rbbt
+    table = rbbt.tsv(filename, *args, **kwargs)
+    return TSVDataset(table)
+def tsv(*args, **kwargs):
+    """Shorthand for `tsv_dataset`; all arguments are forwarded unchanged."""
+    return tsv_dataset(*args, **kwargs)
+def data_dir():
+    """Return whatever `rbbt.path` resolves for 'var/rbbt_dm/data'."""
+    # rbbt is imported lazily, inside the function, as in the other helpers of this module
+    import rbbt
+    return rbbt.path('var/rbbt_dm/data')
+if __name__ == "__main__":
+    # Manual smoke test: load a TSV file and print every row, one sample per batch.
+    import rbbt
+    filename = "/home/miki/test/numeric.tsv"
+    loader = DataLoader(tsv(filename), batch_size=1)
+    for features, labels in loader:
+        print(".")
+        print(features[0,:])
+        print(labels[0])

data/python/rbbt_dm/atcold/__init__.py ADDED Viewed

File without changes

data/python/rbbt_dm/atcold/plot_lib.py ADDED Viewed

@@ -0,0 +1,141 @@
+from matplotlib import pyplot as plt
+import numpy as np
+import torch
+from IPython.display import HTML, display
+def set_default(figsize=(10, 10), dpi=100):
+    """Apply the dark plotting style and set the default figure size and dpi."""
+    plt.style.use(['dark_background', 'bmh'])
+    plt.rc('axes', facecolor='k')
+    # All 'figure' settings in a single rc call
+    plt.rc('figure', facecolor='k', figsize=figsize, dpi=dpi)
+def plot_data(X, y, d=0, auto=False, zoom=1):
+    """Scatter-plot the first two columns of X, coloured by y, with the axes hidden.
+
+    `zoom` scales the fixed (-1.1, 1.1) axis limits; `auto=True` switches to
+    'equal' axis scaling afterwards. `d` is accepted but not used here.
+    X and y are moved with `.cpu()`, so they are presumably torch tensors - confirm with callers.
+    """
+    X = X.cpu()
+    y = y.cpu()
+    plt.scatter(X.numpy()[:, 0], X.numpy()[:, 1], c=y, s=20, cmap=plt.cm.Spectral)
+    plt.axis('square')
+    plt.axis(np.array((-1.1, 1.1, -1.1, 1.1)) * zoom)
+    if auto is True: plt.axis('equal')
+    plt.axis('off')
+    # Thin reference lines through x=0 and y=0
+    _m, _c = 0, '.15'
+    plt.axvline(0, ymin=_m, color=_c, lw=1, zorder=0)
+    plt.axhline(0, xmin=_m, color=_c, lw=1, zorder=0)
+def plot_model(X, y, model):
+    """Draw the model's predicted class regions as filled contours, then overlay the data with plot_data.
+
+    The model is evaluated on a square grid from -1.1 up to 1.1 in steps of 0.01.
+    Assumes `model` maps an (n, 2) float tensor to per-class scores - TODO confirm.
+    """
+    # NOTE(review): model.cpu() is called on the model passed in; presumably this moves it in place - confirm
+    model.cpu()
+    mesh = np.arange(-1.1, 1.1, 0.01)
+    xx, yy = np.meshgrid(mesh, mesh)
+    with torch.no_grad():
+        # One row per grid point: (x, y) pairs as a float tensor
+        data = torch.from_numpy(np.vstack((xx.reshape(-1), yy.reshape(-1))).T).float()
+        Z = model(data).detach()
+    # Index of the largest output per grid point, reshaped back to the grid
+    Z = np.argmax(Z, axis=1).reshape(xx.shape)
+    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.3)
+    plot_data(X, y)
+def show_scatterplot(X, colors, title=''):
+    """Open a new figure and scatter-plot the first two columns of X using `colors`, with the axes hidden.
+
+    Both X and colors are converted with `.cpu().numpy()`, so they are presumably torch tensors - confirm with callers.
+    """
+    colors = colors.cpu().numpy()
+    X = X.cpu().numpy()
+    plt.figure()
+    plt.axis('equal')
+    plt.scatter(X[:, 0], X[:, 1], c=colors, s=30)
+    # plt.grid(True)
+    plt.title(title)
+    plt.axis('off')
+def plot_bases(bases, width=0.04):
+    """Draw two arrows (red and green) described by the four rows of `bases`.
+
+    Rows 0 and 1 are the arrow origins; rows 2 and 3 are the arrow end points,
+    which are converted to offsets from the origins before drawing.
+    The tensor passed in is left unmodified.
+    """
+    # .cpu() returns the very same tensor when it is already on the CPU, so
+    # clone before the in-place subtraction to avoid changing the caller's data.
+    bases = bases.cpu().clone()
+    bases[2:] -= bases[:2]
+    plt.arrow(*bases[0], *bases[2], width=width, color=(1,0,0), zorder=10, alpha=1., length_includes_head=True)
+    plt.arrow(*bases[1], *bases[3], width=width, color=(0,1,0), zorder=10, alpha=1., length_includes_head=True)
+def show_mat(mat, vect, prod, threshold=-1):
+    """Show `mat`, `vect` and `prod` side by side as three colour-mapped panels.
+
+    `threshold` is the lower colour limit of the `prod` panel (the upper limit is 1);
+    the other two panels use colour limits (-1, 1).
+    The arguments are presumably CPU torch tensors (`.numpy()`, `.size()`, `.numel()` are called) - confirm with callers.
+    """
+    # Subplot grid definition
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharex=False, sharey=True,
+                                        gridspec_kw={'width_ratios':[5,1,1]})
+    # Plot matrices
+    cax1 = ax1.matshow(mat.numpy(), clim=(-1, 1))
+    ax2.matshow(vect.numpy(), clim=(-1, 1))
+    cax3 = ax3.matshow(prod.numpy(), clim=(threshold, 1))
+    # Set titles
+    ax1.set_title(f'A: {mat.size(0)} \u00D7 {mat.size(1)}')
+    ax2.set_title(f'a^(i): {vect.numel()}')
+    ax3.set_title(f'p: {prod.numel()}')
+    # Remove xticks for vectors
+    ax2.set_xticks(tuple())
+    ax3.set_xticks(tuple())
+    # Plot colourbars
+    # The colourbar of the first panel is attached to the second axes
+    fig.colorbar(cax1, ax=ax2)
+    fig.colorbar(cax3, ax=ax3)
+    # Fix y-axis limits
+    ax1.set_ylim(bottom=max(len(prod), len(vect)) - 0.5)
+# Named colours as hex strings
+colors = dict(
+    aqua='#8dd3c7',
+    yellow='#ffffb3',
+    lavender='#bebada',
+    red='#fb8072',
+    blue='#80b1d3',
+    orange='#fdb462',
+    green='#b3de69',
+    pink='#fccde5',
+    grey='#d9d9d9',
+    violet='#bc80bd',
+    unk1='#ccebc5',
+    unk2='#ffed6f',
+)
+def _cstr(s, color='black'):
+    """Return an HTML <text> snippet showing `s` on a `color` background.
+
+    A single space gets extra left padding so that it stays visible.
+    """
+    if s != ' ':
+        return f'<text style=color:#000;background-color:{color}>{s} </text>'
+    return f'<text style=color:#000;padding-left:10px;background-color:{color}> </text>'
+# Render (text, colour) pairs as one line of coloured HTML
+def _print_color(t):
+    markup = ''.join(_cstr(ti, color=ci) for ti, ci in t)
+    display(HTML(markup))
+# get appropriate color for value
+def _get_clr(value):
+    """Map `value` (nominally in [0, 1]) to one of 20 hex colours, from blue to red.
+
+    Values outside [0, 1] are clamped to the first or last colour.
+    """
+    palette = ('#85c2e1', '#89c4e2', '#95cae5', '#99cce6', '#a1d0e8',
+               '#b2d9ec', '#baddee', '#c2e1f0', '#eff7fb', '#f9e8e8',
+               '#f9e8e8', '#f9d4d4', '#f9bdbd', '#f8a8a8', '#f68f8f',
+               '#f47676', '#f45f5f', '#f34343', '#f33b3b', '#f42e2e')
+    index = int((value * 100) / 5)
+    # Clamp: 1.0 maps one past the end, larger values would raise IndexError
+    # and negative values would silently index from the wrong end.
+    index = min(max(index, 0), len(palette) - 1)
+    return palette[index]
+def _visualise_values(output_values, result_list):
+    """Display each entry of `result_list` on the colour that _get_clr gives for the value at the same position."""
+    text_colours = [
+        (result_list[i], _get_clr(output_values[i]))
+        for i in range(len(output_values))
+    ]
+    _print_color(text_colours)
+def print_colourbar():
+    """Display a 20-step colour legend for values from -2.5 to 2.5."""
+    to_print = []
+    for x in torch.linspace(-2.5, 2.5, 20):
+        # Rescale [-2.5, 2.5] to [0, 1] before looking up the colour
+        to_print.append((f'{x:.2f}', _get_clr((x+2.5)/5)))
+    _print_color(to_print)
+# Let's only focus on the last time step for now
+# First, the cell state (Long term memory)
+def plot_state(data, state, b, decoder):
+    """Display, for each index along dimension 2 of `state`, the decoded sequence coloured by sigmoid(state).
+
+    `b` selects one entry along dimension 0 of `data` and dimension 1 of `state`.
+    Presumably `data` is (batch, seq, features) and `state` is (seq, batch, hidden) - TODO confirm with callers.
+    """
+    actual_data = decoder(data[b, :, :].numpy())
+    seq_len = len(actual_data)
+    seq_len_w_pad = len(state)
+    for s in range(state.size(2)):
+        states = torch.sigmoid(state[:, b, s])
+        # Keep only the last seq_len entries, one per decoded element
+        _visualise_values(states[seq_len_w_pad - seq_len:], list(actual_data))

data/python/rbbt_dm/atcold/spiral.py ADDED Viewed

@@ -0,0 +1,27 @@
+import torch
+import math
+def spiral_data(N=1000, D=2, C=3):
+    """Generate C noisy spiral arms of N points each.
+
+    Returns `(X, y)`: X is an (N * C, D) float tensor of points and y an
+    (N * C,) long tensor with the arm index of each point.
+    """
+    X = torch.zeros(N * C, D)
+    y = torch.zeros(N * C, dtype=torch.long)
+    for c in range(C):
+        # t runs from 0 (start of linspace) to 1 (end of linspace) and scales each point
+        t = torch.linspace(0, 1, N)
+        # inner_var is the argument passed to sin() and cos(): it runs from
+        # arm_start (when t = 0) to arm_end (when t = 1), plus Gaussian noise
+        arm_start = (2 * math.pi / C) * (c)
+        arm_end = (2 * math.pi / C) * (2 + c)
+        inner_var = torch.linspace(arm_start, arm_end, N) + torch.randn(N) * 0.2
+        for index, ix in enumerate(range(N * c, N * (c + 1))):
+            angle = inner_var[index]
+            X[ix] = t[index] * torch.FloatTensor((
+                math.sin(angle), math.cos(angle)
+            ))
+            y[ix] = c
+    return (X, y)