rbbt-text 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
4
- data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
3
+ metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
4
+ data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
5
5
  SHA512:
6
- metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
7
- data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
6
+ metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
7
+ data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
+ def self.weighted_features(text, weights)
74
+ features = features(text, weights.keys)
75
+ features.zip(weights.values).collect{|f,w| f * w }
76
+ end
72
77
  end
73
78
 
74
79
  class String
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -68,6 +68,10 @@ module Transformed
68
68
 
69
69
  segments = [segments] unless Array === segments
70
70
  orig_length = self.length
71
+
72
+ offset = self.respond_to?(:offset) ? self.offset : 0
73
+ segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
74
+
71
75
  Segment.clean_sort(segments).each do |segment|
72
76
  next if segment.offset.nil?
73
77
 
@@ -86,7 +90,7 @@ module Transformed
86
90
 
87
91
  updated_text = self[updated_begin..updated_end]
88
92
  if updated_text.nil?
89
- Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
93
+ Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
90
94
  next
91
95
  end
92
96
 
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
101
101
  assert_equal "CDK5R1 protein", exp2
102
102
  end
103
103
 
104
+ def test_with_transform_sentences
105
+ a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
106
+ original = a.dup
107
+
108
+ gene1 = "TP53"
109
+ gene1.extend NamedEntity
110
+ gene1.offset = a.index gene1
111
+
112
+ gene2 = "CDK5R1"
113
+ gene2.extend NamedEntity
114
+ gene2.offset = a.index gene2
115
+
116
+ bread = "Bread"
117
+ bread.extend NamedEntity
118
+ bread.offset = a.index bread
119
+
120
+ sentences = Segment.align(a, a.split(". "))
121
+
122
+ Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
123
+ assert sentences[1].include?("GN gene and the GN protein")
124
+ end
125
+
126
+ Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
127
+ assert sentences[0].include?("first sentence mentions BR")
128
+ end
129
+
130
+
131
+ end
132
+
104
133
  def test_html
105
134
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
106
135
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-18 00:00:00.000000000 Z
11
+ date: 2020-04-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util