rbbt-dm 1.1.56 → 1.1.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ab1295a1cb43602045e0c75226cc71e1b8eb8fbc7ce72f869f1636309745ad7
4
- data.tar.gz: 0c6bd97a2e8c81deb4435531344fd5b145382ddafd06012e93d8695de89f3ff2
3
+ metadata.gz: 072d57536b223931947dfd71d5e359961c6bfe44da0870cd2dbab440aa91ba6c
4
+ data.tar.gz: 7c5d2a3fc627992231b621b6efe4d2160aa093b11c89f31f728ede5121d2edc9
5
5
  SHA512:
6
- metadata.gz: 9f2b34158e345c703e60a94d7481e397add3651b0fc5fd7c80205b2c44ae9578e1ccac2b9d67e3679afa37f60d56fe3defed8fbd42c81765e1239b90e8bc06da
7
- data.tar.gz: fc1d764b9a240b60bd1de30e22f534485f7319c0396b291e28b0fe1ad48acde7cdeb5de7448509550892cd17e743b5388348f1f40584233280b5ec42cc95940c
6
+ metadata.gz: 5216b3179bc4a809829b79bc4c447159e88444dc7943da3d0c3643b728e3eb5a6da7c08a8538fb88db1e463f95e55e104976ac81f010aeaf729a6cb1c4ca1374
7
+ data.tar.gz: 12baae07f04ca3770dfef81c3166385badb7944d25b7072364631e5f93b419e04c53d5b7157934b3d1522ef83e84acf0c4e2c5d39ad3604b7ef8f5f460d2f750
@@ -4,13 +4,13 @@ require 'rbbt/nlp/spaCy'
4
4
  class SpaCyModel < VectorModel
5
5
  attr_accessor :config
6
6
 
7
- def spacy(&block)
7
+ def self.spacy(&block)
8
8
  RbbtPython.run "spacy" do
9
9
  RbbtPython.module_eval(&block)
10
10
  end
11
11
  end
12
12
 
13
- def initialize(dir, config, lang = 'en_core_web_md')
13
+ def initialize(dir, config, categories = %w(positive negative), lang = 'en_core_web_md')
14
14
  @config = case
15
15
  when Path === config
16
16
  config.read
@@ -33,17 +33,18 @@ class SpaCyModel < VectorModel
33
33
  tmpconfig = File.join(file, 'config')
34
34
  tmptrain = File.join(file, 'train.spacy')
35
35
  SpaCy.config(@config, tmpconfig)
36
- spacy do
36
+ SpaCyModel.spacy do
37
37
  nlp = SpaCy.nlp(lang)
38
38
  docs = []
39
39
  RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar => "Training documents into spacy format" do |doc,label|
40
- if %w(1 true pos).include?(label.to_s.downcase)
41
- doc.cats["positive"] = 1
42
- doc.cats["negative"] = 0
43
- else
44
- doc.cats["positive"] = 0
45
- doc.cats["negative"] = 1
46
- end
40
+ doc.cats[label] = 1
41
+ #if %w(1 true pos).include?(label.to_s.downcase)
42
+ # doc.cats["positive"] = 1
43
+ # doc.cats["negative"] = 0
44
+ #else
45
+ # doc.cats["positive"] = 0
46
+ # doc.cats["negative"] = 1
47
+ #end
47
48
  docs << doc
48
49
  end
49
50
 
@@ -51,7 +52,7 @@ class SpaCyModel < VectorModel
51
52
  doc_bin.to_disk(tmptrain)
52
53
  end
53
54
 
54
- gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
55
+ gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train, :default => 0)
55
56
  CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
56
57
  end
57
58
 
@@ -59,14 +60,15 @@ class SpaCyModel < VectorModel
59
60
  texts = features
60
61
 
61
62
  docs = []
62
- spacy do
63
+ SpaCyModel.spacy do
63
64
  nlp = spacy.load("#{file}/model-best")
64
65
 
65
66
  Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
66
67
  texts.collect do |text|
67
68
  cats = nlp.(text).cats
68
69
  bar.tick
69
- cats['positive'] > cats['negative'] ? 1 : 0
70
+ cats.sort_by{|l,v| v.to_f }.last.first
71
+ #cats['positive'] > cats['negative'] ? 1 : 0
70
72
  end
71
73
  end
72
74
  end
@@ -320,6 +320,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
320
320
  acc = []
321
321
  labels.each do |good_label|
322
322
  values = VectorModel.f1_metrics(test, predicted, good_label)
323
+ tp, tn, fp, fn, pr, re, f1 = values
324
+ Log.debug "Partial CV #{good_label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
323
325
  acc << values
324
326
  end
325
327
  Misc.zip_fields(acc).collect{|s| Misc.mean(s)}
@@ -340,12 +342,21 @@ cat(paste(label, sep="\\n", collapse="\\n"));
340
342
  end
341
343
 
342
344
  begin
343
- feature_folds = Misc.divide(@features, folds)
344
- labels_folds = Misc.divide(@labels, folds)
345
+ if folds == 1
346
+ feature_folds = [@features]
347
+ labels_folds = [@labels]
348
+ else
349
+ feature_folds = Misc.divide(@features, folds)
350
+ labels_folds = Misc.divide(@labels, folds)
351
+ end
345
352
 
346
353
  folds.times do |fix|
347
354
 
348
- rest = (0..(folds-1)).to_a - [fix]
355
+ if folds == 1
356
+ rest = [fix]
357
+ else
358
+ rest = (0..(folds-1)).to_a - [fix]
359
+ end
349
360
 
350
361
  test_set = feature_folds[fix]
351
362
  train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}
@@ -355,6 +366,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
355
366
 
356
367
  @features = train_set
357
368
  @labels = train_labels
369
+
358
370
  self.train
359
371
  predictions = self.eval_list test_set, false
360
372
 
@@ -362,6 +374,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
362
374
 
363
375
  different_labels = test_labels.uniq
364
376
 
377
+ Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%" end
378
+
365
379
  tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)
366
380
 
367
381
  if multiclass
@@ -377,7 +391,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
377
391
  @features = orig_features
378
392
  @labels = orig_labels
379
393
  end
380
- self.train
394
+ self.train unless folds == 1
381
395
  res
382
396
  end
383
397
  end
@@ -0,0 +1,86 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = null
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["tok2vec","textcat_multilabel"]
14
+ batch_size = 1000
15
+
16
+ [components]
17
+
18
+ [components.tok2vec]
19
+ factory = "tok2vec"
20
+
21
+ [components.tok2vec.model]
22
+ @architectures = "spacy.Tok2Vec.v2"
23
+
24
+ [components.tok2vec.model.embed]
25
+ @architectures = "spacy.MultiHashEmbed.v2"
26
+ width = ${components.tok2vec.model.encode.width}
27
+ attrs = ["ORTH", "SHAPE"]
28
+ rows = [5000, 2500]
29
+ include_static_vectors = true
30
+
31
+ [components.tok2vec.model.encode]
32
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
33
+ width = 256
34
+ depth = 8
35
+ window_size = 1
36
+ maxout_pieces = 3
37
+
38
+ [components.textcat_multilabel]
39
+ factory = "textcat_multilabel"
40
+
41
+ [components.textcat_multilabel.model]
42
+ @architectures = "spacy.TextCatEnsemble.v2"
43
+ nO = null
44
+
45
+ [components.textcat_multilabel.model.tok2vec]
46
+ @architectures = "spacy.Tok2VecListener.v1"
47
+ width = ${components.tok2vec.model.encode.width}
48
+
49
+ [components.textcat_multilabel.model.linear_model]
50
+ @architectures = "spacy.TextCatBOW.v1"
51
+ exclusive_classes = true
52
+ ngram_size = 1
53
+ no_output_layer = false
54
+
55
+ [corpora]
56
+
57
+ [corpora.train]
58
+ @readers = "spacy.Corpus.v1"
59
+ path = ${paths.train}
60
+ max_length = 2000
61
+
62
+ [corpora.dev]
63
+ @readers = "spacy.Corpus.v1"
64
+ path = ${paths.dev}
65
+ max_length = 0
66
+
67
+ [training]
68
+ dev_corpus = "corpora.dev"
69
+ train_corpus = "corpora.train"
70
+
71
+ [training.optimizer]
72
+ @optimizers = "Adam.v1"
73
+
74
+ [training.batcher]
75
+ @batchers = "spacy.batch_by_words.v1"
76
+ discard_oversize = false
77
+ tolerance = 0.2
78
+
79
+ [training.batcher.size]
80
+ @schedules = "compounding.v1"
81
+ start = 100
82
+ stop = 1000
83
+ compound = 1.001
84
+
85
+ [initialize]
86
+ vectors = "en_core_web_lg"
@@ -0,0 +1,78 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = null
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["tok2vec","textcat_multilabel"]
14
+ batch_size = 1000
15
+
16
+ [components]
17
+
18
+ [components.tok2vec]
19
+ factory = "tok2vec"
20
+
21
+ [components.tok2vec.model]
22
+ @architectures = "spacy.Tok2Vec.v2"
23
+
24
+ [components.tok2vec.model.embed]
25
+ @architectures = "spacy.MultiHashEmbed.v2"
26
+ width = ${components.tok2vec.model.encode.width}
27
+ attrs = ["ORTH", "SHAPE"]
28
+ rows = [5000, 2500]
29
+ include_static_vectors = false
30
+
31
+ [components.tok2vec.model.encode]
32
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
33
+ width = 96
34
+ depth = 4
35
+ window_size = 1
36
+ maxout_pieces = 3
37
+
38
+ [components.textcat_multilabel]
39
+ factory = "textcat_multilabel"
40
+
41
+ [components.textcat_multilabel.model]
42
+ @architectures = "spacy.TextCatBOW.v1"
43
+ exclusive_classes = true
44
+ ngram_size = 1
45
+ no_output_layer = false
46
+
47
+ [corpora]
48
+
49
+ [corpora.train]
50
+ @readers = "spacy.Corpus.v1"
51
+ path = ${paths.train}
52
+ max_length = 2000
53
+
54
+ [corpora.dev]
55
+ @readers = "spacy.Corpus.v1"
56
+ path = ${paths.dev}
57
+ max_length = 0
58
+
59
+ [training]
60
+ dev_corpus = "corpora.dev"
61
+ train_corpus = "corpora.train"
62
+
63
+ [training.optimizer]
64
+ @optimizers = "Adam.v1"
65
+
66
+ [training.batcher]
67
+ @batchers = "spacy.batch_by_words.v1"
68
+ discard_oversize = false
69
+ tolerance = 0.2
70
+
71
+ [training.batcher.size]
72
+ @schedules = "compounding.v1"
73
+ start = 100
74
+ stop = 1000
75
+ compound = 1.001
76
+
77
+ [initialize]
78
+ vectors = null
@@ -0,0 +1,84 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = "pytorch"
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["transformer","textcat_multilabel"]
14
+ batch_size = 128
15
+
16
+ [components]
17
+
18
+ [components.transformer]
19
+ factory = "transformer"
20
+
21
+ [components.transformer.model]
22
+ @architectures = "spacy-transformers.TransformerModel.v1"
23
+ name = "emilyalsentzer/Bio_ClinicalBERT"
24
+ tokenizer_config = {"use_fast": true}
25
+
26
+ [components.transformer.model.get_spans]
27
+ @span_getters = "spacy-transformers.strided_spans.v1"
28
+ window = 128
29
+ stride = 96
30
+
31
+ [components.textcat_multilabel]
32
+ factory = "textcat_multilabel"
33
+
34
+ [components.textcat_multilabel.model]
35
+ @architectures = "spacy.TextCatEnsemble.v2"
36
+ nO = null
37
+
38
+ [components.textcat_multilabel.model.tok2vec]
39
+ @architectures = "spacy-transformers.TransformerListener.v1"
40
+ grad_factor = 1.0
41
+
42
+ [components.textcat_multilabel.model.tok2vec.pooling]
43
+ @layers = "reduce_mean.v1"
44
+
45
+ [components.textcat_multilabel.model.linear_model]
46
+ @architectures = "spacy.TextCatBOW.v1"
47
+ exclusive_classes = true
48
+ ngram_size = 1
49
+ no_output_layer = false
50
+
51
+ [corpora]
52
+
53
+ [corpora.train]
54
+ @readers = "spacy.Corpus.v1"
55
+ path = ${paths.train}
56
+ max_length = 500
57
+
58
+ [corpora.dev]
59
+ @readers = "spacy.Corpus.v1"
60
+ path = ${paths.dev}
61
+ max_length = 0
62
+
63
+ [training]
64
+ accumulate_gradient = 3
65
+ dev_corpus = "corpora.dev"
66
+ train_corpus = "corpora.train"
67
+
68
+ [training.optimizer]
69
+ @optimizers = "Adam.v1"
70
+
71
+ [training.optimizer.learn_rate]
72
+ @schedules = "warmup_linear.v1"
73
+ warmup_steps = 250
74
+ total_steps = 20000
75
+ initial_rate = 5e-5
76
+
77
+ [training.batcher]
78
+ @batchers = "spacy.batch_by_padded.v1"
79
+ discard_oversize = true
80
+ size = 2000
81
+ buffer = 256
82
+
83
+ [initialize]
84
+ vectors = null
@@ -0,0 +1,73 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = "pytorch"
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["transformer","textcat_multilabel"]
14
+ batch_size = 128
15
+
16
+ [components]
17
+
18
+ [components.transformer]
19
+ factory = "transformer"
20
+
21
+ [components.transformer.model]
22
+ @architectures = "spacy-transformers.TransformerModel.v1"
23
+ name = "roberta-base"
24
+ tokenizer_config = {"use_fast": true}
25
+
26
+ [components.transformer.model.get_spans]
27
+ @span_getters = "spacy-transformers.strided_spans.v1"
28
+ window = 128
29
+ stride = 96
30
+
31
+ [components.textcat_multilabel]
32
+ factory = "textcat_multilabel"
33
+
34
+ [components.textcat_multilabel.model]
35
+ @architectures = "spacy.TextCatBOW.v1"
36
+ exclusive_classes = true
37
+ ngram_size = 1
38
+ no_output_layer = false
39
+
40
+ [corpora]
41
+
42
+ [corpora.train]
43
+ @readers = "spacy.Corpus.v1"
44
+ path = ${paths.train}
45
+ max_length = 500
46
+
47
+ [corpora.dev]
48
+ @readers = "spacy.Corpus.v1"
49
+ path = ${paths.dev}
50
+ max_length = 0
51
+
52
+ [training]
53
+ accumulate_gradient = 3
54
+ dev_corpus = "corpora.dev"
55
+ train_corpus = "corpora.train"
56
+
57
+ [training.optimizer]
58
+ @optimizers = "Adam.v1"
59
+
60
+ [training.optimizer.learn_rate]
61
+ @schedules = "warmup_linear.v1"
62
+ warmup_steps = 250
63
+ total_steps = 20000
64
+ initial_rate = 5e-5
65
+
66
+ [training.batcher]
67
+ @batchers = "spacy.batch_by_padded.v1"
68
+ discard_oversize = true
69
+ size = 2000
70
+ buffer = 256
71
+
72
+ [initialize]
73
+ vectors = null
@@ -23,18 +23,23 @@ class TestSpaCyModel < Test::Unit::TestCase
23
23
  good = tsv.select("Recommended IND" => '1')
24
24
  bad = tsv.select("Recommended IND" => '0')
25
25
 
26
- gsize = 2000
27
- bsize = 500
26
+ gsize = 200
27
+ bsize = 50
28
28
  good.keys[0..gsize-1].each do |text|
29
29
  next if text.nil? || text.empty?
30
- model.add text, '1'
30
+ model.add text, 'good'
31
31
  end
32
32
 
33
33
  bad.keys[0..bsize-1].each do |text|
34
- model.add text, '0'
34
+ model.add text, 'bad'
35
35
  end
36
36
 
37
- model.cross_validation
37
+ model.cross_validation 1
38
+
39
+ model = VectorModel.new dir
40
+
41
+ assert Misc.counts(model.eval_list(good.keys[0..50]))['good'] > 40
42
+ assert Misc.counts(model.eval_list(bad.keys[0..50]))['bad'] > 40
38
43
  end
39
44
 
40
45
  def test_svm_spacy
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.56
4
+ version: 1.1.57
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-20 00:00:00.000000000 Z
11
+ date: 2022-05-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -116,8 +116,12 @@ files:
116
116
  - share/R/heatmap.3.R
117
117
  - share/spaCy/cpu/textcat_accuracy.conf
118
118
  - share/spaCy/cpu/textcat_efficiency.conf
119
+ - share/spaCy/cpu/textcat_multilabel_accuracy.conf
120
+ - share/spaCy/cpu/textcat_multilabel_efficiency.conf
119
121
  - share/spaCy/gpu/textcat_accuracy.conf
120
122
  - share/spaCy/gpu/textcat_efficiency.conf
123
+ - share/spaCy/gpu/textcat_multilabel_accuracy.conf
124
+ - share/spaCy/gpu/textcat_multilabel_efficiency.conf
121
125
  - test/rbbt/matrix/test_barcode.rb
122
126
  - test/rbbt/network/test_paths.rb
123
127
  - test/rbbt/statistics/test_fdr.rb