rbbt-dm 1.1.56 → 1.1.57

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8ab1295a1cb43602045e0c75226cc71e1b8eb8fbc7ce72f869f1636309745ad7
-  data.tar.gz: 0c6bd97a2e8c81deb4435531344fd5b145382ddafd06012e93d8695de89f3ff2
+  metadata.gz: 072d57536b223931947dfd71d5e359961c6bfe44da0870cd2dbab440aa91ba6c
+  data.tar.gz: 7c5d2a3fc627992231b621b6efe4d2160aa093b11c89f31f728ede5121d2edc9
 SHA512:
-  metadata.gz: 9f2b34158e345c703e60a94d7481e397add3651b0fc5fd7c80205b2c44ae9578e1ccac2b9d67e3679afa37f60d56fe3defed8fbd42c81765e1239b90e8bc06da
-  data.tar.gz: fc1d764b9a240b60bd1de30e22f534485f7319c0396b291e28b0fe1ad48acde7cdeb5de7448509550892cd17e743b5388348f1f40584233280b5ec42cc95940c
+  metadata.gz: 5216b3179bc4a809829b79bc4c447159e88444dc7943da3d0c3643b728e3eb5a6da7c08a8538fb88db1e463f95e55e104976ac81f010aeaf729a6cb1c4ca1374
+  data.tar.gz: 12baae07f04ca3770dfef81c3166385badb7944d25b7072364631e5f93b419e04c53d5b7157934b3d1522ef83e84acf0c4e2c5d39ad3604b7ef8f5f460d2f750
@@ -4,13 +4,13 @@ require 'rbbt/nlp/spaCy'
 class SpaCyModel < VectorModel
   attr_accessor :config
 
-  def spacy(&block)
+  def self.spacy(&block)
     RbbtPython.run "spacy" do
       RbbtPython.module_eval(&block)
     end
   end
 
-  def initialize(dir, config, lang = 'en_core_web_md')
+  def initialize(dir, config, categories = %w(positive negative), lang = 'en_core_web_md')
     @config = case
               when Path === config
                 config.read
@@ -33,17 +33,18 @@ class SpaCyModel < VectorModel
       tmpconfig = File.join(file, 'config')
       tmptrain = File.join(file, 'train.spacy')
       SpaCy.config(@config, tmpconfig)
-      spacy do
+      SpaCyModel.spacy do
        nlp = SpaCy.nlp(lang)
        docs = []
        RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar => "Training documents into spacy format" do |doc,label|
-          if %w(1 true pos).include?(label.to_s.downcase)
-            doc.cats["positive"] = 1
-            doc.cats["negative"] = 0
-          else
-            doc.cats["positive"] = 0
-            doc.cats["negative"] = 1
-          end
+          doc.cats[label] = 1
+          #if %w(1 true pos).include?(label.to_s.downcase)
+          #  doc.cats["positive"] = 1
+          #  doc.cats["negative"] = 0
+          #else
+          #  doc.cats["positive"] = 0
+          #  doc.cats["negative"] = 1
+          #end
          docs << doc
        end
 
@@ -51,7 +52,7 @@ class SpaCyModel < VectorModel
        doc_bin.to_disk(tmptrain)
       end
 
-      gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
+      gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train, :default => 0)
       CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
     end
 
@@ -59,14 +60,15 @@ class SpaCyModel < VectorModel
       texts = features
 
       docs = []
-      spacy do
+      SpaCyModel.spacy do
        nlp = spacy.load("#{file}/model-best")
 
        Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
          texts.collect do |text|
            cats = nlp.(text).cats
            bar.tick
-            cats['positive'] > cats['negative'] ? 1 : 0
+            cats.sort_by{|l,v| v.to_f }.last.first
+            #cats['positive'] > cats['negative'] ? 1 : 0
          end
        end
      end
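
Taken together, these changes turn SpaCyModel into a multi-category classifier: training passes each label straight through to doc.cats, and evaluation returns whichever category scores highest instead of a hard-coded positive/negative decision. A minimal usage sketch follows; the require path, directory, config path, texts and label names are illustrative assumptions, not taken from this diff:

  require 'rbbt/vector/model/spacy'   # assumed require path for SpaCyModel

  # Hypothetical three-way classifier; any label strings should work now
  config = Rbbt.share.spaCy.cpu["textcat_multilabel_efficiency.conf"]  # assumed Path helper
  model  = SpaCyModel.new "/tmp/review_model", config, %w(good bad neutral)

  model.add "Great product, would buy again", 'good'
  model.add "Arrived broken and late",        'bad'
  model.train                                   # builds train.spacy and shells out to `spacy train`
  model.eval_list(["Works as expected"])        # => e.g. ["good"], the top-scoring category
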
@@ -320,6 +320,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       acc = []
       labels.each do |good_label|
         values = VectorModel.f1_metrics(test, predicted, good_label)
+        tp, tn, fp, fn, pr, re, f1 = values
+        Log.debug "Partial CV #{good_label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
         acc << values
       end
       Misc.zip_fields(acc).collect{|s| Misc.mean(s)}
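
The new debug line unpacks the tuple returned by VectorModel.f1_metrics in the order tp, tn, fp, fn, precision, recall, F1. As a reference for reading those logs, the usual definitions are sketched below with made-up confusion-matrix counts; the actual arithmetic lives inside f1_metrics:

  tp, tn, fp, fn = 40, 45, 5, 10          # hypothetical counts
  pr = tp.to_f / (tp + fp)                # precision: 0.889
  re = tp.to_f / (tp + fn)                # recall:    0.800
  f1 = 2 * pr * re / (pr + re)            # F1:        0.842
  Log.debug "P:%.3f R:%.3f F1:%.3f" % [pr, re, f1]
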
@@ -340,12 +342,21 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       end
 
       begin
-        feature_folds = Misc.divide(@features, folds)
-        labels_folds = Misc.divide(@labels, folds)
+        if folds == 1
+          feature_folds = [@features]
+          labels_folds = [@labels]
+        else
+          feature_folds = Misc.divide(@features, folds)
+          labels_folds = Misc.divide(@labels, folds)
+        end
 
        folds.times do |fix|
 
-          rest = (0..(folds-1)).to_a - [fix]
+          if folds == 1
+            rest = [fix]
+          else
+            rest = (0..(folds-1)).to_a - [fix]
+          end
 
          test_set = feature_folds[fix]
          train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}
@@ -355,6 +366,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
          @features = train_set
          @labels = train_labels
+
          self.train
          predictions = self.eval_list test_set, false
 
@@ -362,6 +374,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
          different_labels = test_labels.uniq
 
+          Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%" end
+
          tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)
 
          if multiclass
@@ -377,7 +391,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
        @features = orig_features
        @labels = orig_labels
      end
-      self.train
+      self.train unless folds == 1
      res
    end
  end
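
With these changes, folds == 1 becomes a degenerate but useful case: the single fold acts as both training and test set (rest = [fix]), the per-fold accuracy is therefore measured on the training data itself, and the final re-training pass is skipped because the model is already trained on everything. From the caller's side (a sketch; model is assumed to be any populated VectorModel subclass):

  model.cross_validation 10   # proper 10-fold CV, re-trains on all data at the end
  model.cross_validation 1    # single pass: train on all data, report accuracy on that same data
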
@@ -0,0 +1,86 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","textcat_multilabel"]
+batch_size = 1000
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+include_static_vectors = true
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 256
+depth = 8
+window_size = 1
+maxout_pieces = 3
+
+[components.textcat_multilabel]
+factory = "textcat_multilabel"
+
+[components.textcat_multilabel.model]
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat_multilabel.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 2000
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[initialize]
+vectors = "en_core_web_lg"
@@ -0,0 +1,78 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","textcat_multilabel"]
+batch_size = 1000
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.textcat_multilabel]
+factory = "textcat_multilabel"
+
+[components.textcat_multilabel.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 2000
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[initialize]
+vectors = null
@@ -0,0 +1,84 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = "pytorch"
+
+[nlp]
+lang = "en"
+pipeline = ["transformer","textcat_multilabel"]
+batch_size = 128
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "emilyalsentzer/Bio_ClinicalBERT"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.textcat_multilabel]
+factory = "textcat_multilabel"
+
+[components.textcat_multilabel.model]
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.textcat_multilabel.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null
@@ -0,0 +1,73 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = "pytorch"
+
+[nlp]
+lang = "en"
+pipeline = ["transformer","textcat_multilabel"]
+batch_size = 128
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "roberta-base"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.textcat_multilabel]
+factory = "textcat_multilabel"
+
+[components.textcat_multilabel.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null
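
The four files above are the new share/spaCy/cpu/textcat_multilabel_*.conf and share/spaCy/gpu/textcat_multilabel_*.conf partial configs listed in the gem metadata below. Like the existing textcat configs, they are base configs that spaCy expands with defaults before training. A sketch of how such a config would be filled and used, mirroring the CMD.cmd_log call in the training block (paths here are illustrative assumptions):

  base   = "share/spaCy/gpu/textcat_multilabel_accuracy.conf"   # one of the new partial configs
  config = "/tmp/model/config"                                  # hypothetical output location
  CMD.cmd_log(:spacy, "init fill-config #{base} #{config}")     # expand defaults for `spacy train`
  CMD.cmd_log(:spacy, "train #{config} --output /tmp/model --paths.train /tmp/train.spacy --paths.dev /tmp/train.spacy")
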
@@ -23,18 +23,23 @@ class TestSpaCyModel < Test::Unit::TestCase
     good = tsv.select("Recommended IND" => '1')
     bad = tsv.select("Recommended IND" => '0')
 
-    gsize = 2000
-    bsize = 500
+    gsize = 200
+    bsize = 50
     good.keys[0..gsize-1].each do |text|
       next if text.nil? || text.empty?
-      model.add text, '1'
+      model.add text, 'good'
     end
 
     bad.keys[0..bsize-1].each do |text|
-      model.add text, '0'
+      model.add text, 'bad'
     end
 
-    model.cross_validation
+    model.cross_validation 1
+
+    model = VectorModel.new dir
+
+    assert Misc.counts(model.eval_list(good.keys[0..50]))['good'] > 40
+    assert Misc.counts(model.eval_list(bad.keys[0..50]))['bad'] > 40
   end
 
   def test_svm_spacy
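
The new assertions reload the trained model from disk as a plain VectorModel and require at least 41 of the 51 evaluated texts per class to be predicted correctly. They rely on Misc.counts tallying how often each prediction appears (assuming the usual rbbt-util behaviour of that helper):

  Misc.counts(%w(good good bad good))   # => {"good" => 3, "bad" => 1}
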
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.1.56
+  version: 1.1.57
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-12-20 00:00:00.000000000 Z
+date: 2022-05-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -116,8 +116,12 @@ files:
 - share/R/heatmap.3.R
 - share/spaCy/cpu/textcat_accuracy.conf
 - share/spaCy/cpu/textcat_efficiency.conf
+- share/spaCy/cpu/textcat_multilabel_accuracy.conf
+- share/spaCy/cpu/textcat_multilabel_efficiency.conf
 - share/spaCy/gpu/textcat_accuracy.conf
 - share/spaCy/gpu/textcat_efficiency.conf
+- share/spaCy/gpu/textcat_multilabel_accuracy.conf
+- share/spaCy/gpu/textcat_multilabel_efficiency.conf
 - test/rbbt/matrix/test_barcode.rb
 - test/rbbt/network/test_paths.rb
 - test/rbbt/statistics/test_fdr.rb