rbbt-text 1.3.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -0
- data/lib/rbbt/bow/dictionary.rb +26 -22
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/test/rbbt/segment/test_transformed.rb +29 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
|
4
|
+
data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
|
7
|
+
data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
|
data/lib/rbbt/bow/bow.rb
CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
|
|
69
69
|
count = bigrams ? count(bigrams(text)) : count(words(text))
|
70
70
|
count.values_at(*terms)
|
71
71
|
end
|
72
|
+
|
73
|
+
def self.weighted_features(text, weights)
|
74
|
+
features = features(text, weights.keys)
|
75
|
+
features.zip(weights.values).collect{|f,w| f * w }
|
76
|
+
end
|
72
77
|
end
|
73
78
|
|
74
79
|
class String
|
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def best(options = {})
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
77
|
+
key = Misc.obj2digest(options)
|
78
|
+
@best ||= {}
|
79
|
+
@best[key] ||= begin
|
80
|
+
high, low, limit = {
|
81
|
+
:low => 0,
|
82
|
+
:high => 1,
|
83
|
+
}.merge(options).
|
84
|
+
values_at(:high, :low, :limit)
|
85
|
+
|
86
|
+
num_docs = @num_docs.to_f
|
87
|
+
best = df.select{|term, value|
|
88
|
+
value >= low && value <= high
|
89
|
+
}.collect{|p|
|
90
|
+
term = p.first
|
91
|
+
df_value = p.last
|
92
|
+
[term,
|
93
|
+
@terms[term].to_f / num_docs * Math::log(1.0/df_value)
|
94
|
+
]
|
95
|
+
}
|
96
|
+
|
97
|
+
if limit
|
98
|
+
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
|
99
|
+
else
|
100
|
+
Hash[*best.flatten]
|
101
|
+
end
|
102
|
+
end
|
99
103
|
end
|
100
104
|
|
101
105
|
def weights(options = {})
|
data/lib/rbbt/segment/transformed.rb
CHANGED
@@ -68,6 +68,10 @@ module Transformed
|
|
68
68
|
|
69
69
|
segments = [segments] unless Array === segments
|
70
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset : 0
|
73
|
+
segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
|
74
|
+
|
71
75
|
Segment.clean_sort(segments).each do |segment|
|
72
76
|
next if segment.offset.nil?
|
73
77
|
|
@@ -86,7 +90,7 @@ module Transformed
|
|
86
90
|
|
87
91
|
updated_text = self[updated_begin..updated_end]
|
88
92
|
if updated_text.nil?
|
89
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
90
94
|
next
|
91
95
|
end
|
92
96
|
|
data/test/rbbt/segment/test_transformed.rb
CHANGED
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
101
101
|
assert_equal "CDK5R1 protein", exp2
|
102
102
|
end
|
103
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
104
133
|
def test_html
|
105
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
135
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04-
|
11
|
+
date: 2020-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|