rbbt-text 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -0
- data/lib/rbbt/bow/dictionary.rb +26 -22
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/test/rbbt/segment/test_transformed.rb +29 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
|
4
|
+
data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
|
7
|
+
data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
|
data/lib/rbbt/bow/bow.rb
CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
|
|
69
69
|
count = bigrams ? count(bigrams(text)) : count(words(text))
|
70
70
|
count.values_at(*terms)
|
71
71
|
end
|
72
|
+
|
73
|
+
def self.weighted_features(text, weights)
|
74
|
+
features = features(text, weights.keys)
|
75
|
+
features.zip(weights.values).collect{|f,w| f * w }
|
76
|
+
end
|
72
77
|
end
|
73
78
|
|
74
79
|
class String
|
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def best(options = {})
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
77
|
+
key = Misc.obj2digest(options)
|
78
|
+
@best ||= {}
|
79
|
+
@best[key] ||= begin
|
80
|
+
high, low, limit = {
|
81
|
+
:low => 0,
|
82
|
+
:high => 1,
|
83
|
+
}.merge(options).
|
84
|
+
values_at(:high, :low, :limit)
|
85
|
+
|
86
|
+
num_docs = @num_docs.to_f
|
87
|
+
best = df.select{|term, value|
|
88
|
+
value >= low && value <= high
|
89
|
+
}.collect{|p|
|
90
|
+
term = p.first
|
91
|
+
df_value = p.last
|
92
|
+
[term,
|
93
|
+
@terms[term].to_f / num_docs * Math::log(1.0/df_value)
|
94
|
+
]
|
95
|
+
}
|
96
|
+
|
97
|
+
if limit
|
98
|
+
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
|
99
|
+
else
|
100
|
+
Hash[*best.flatten]
|
101
|
+
end
|
102
|
+
end
|
99
103
|
end
|
100
104
|
|
101
105
|
def weights(options = {})
|
data/lib/rbbt/segment/transformed.rb
CHANGED
@@ -68,6 +68,10 @@ module Transformed
|
|
68
68
|
|
69
69
|
segments = [segments] unless Array === segments
|
70
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset : 0
|
73
|
+
segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
|
74
|
+
|
71
75
|
Segment.clean_sort(segments).each do |segment|
|
72
76
|
next if segment.offset.nil?
|
73
77
|
|
@@ -86,7 +90,7 @@ module Transformed
|
|
86
90
|
|
87
91
|
updated_text = self[updated_begin..updated_end]
|
88
92
|
if updated_text.nil?
|
89
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
90
94
|
next
|
91
95
|
end
|
92
96
|
|
data/test/rbbt/segment/test_transformed.rb
CHANGED
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
101
101
|
assert_equal "CDK5R1 protein", exp2
|
102
102
|
end
|
103
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
104
133
|
def test_html
|
105
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
135
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04-
|
11
|
+
date: 2020-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|