rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,35 @@
1
+ module Segment::RangeIndex
2
+ attr_accessor :corpus
3
+
4
+ def [](*args)
5
+ res = super(*args)
6
+ SegID.setup(res, :corpus => corpus)
7
+ end
8
+
9
+ def self.index(segments, corpus, persist_file = :memory)
10
+ segments = segments.values.flatten if Hash === segments
11
+
12
+ annotation_index =
13
+ Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
14
+
15
+ value_size = 0
16
+ index_data = segments.collect{|segment|
17
+ next if segment.offset.nil?
18
+ range = segment.range
19
+ value_size = [segment.segid.length, value_size].max
20
+ [segment.segid, [range.begin, range.end]]
21
+ }.compact
22
+
23
+ fwt = FixWidthTable.get :memory, value_size, true
24
+ fwt.add_range index_data
25
+
26
+ fwt
27
+ end
28
+
29
+ annotation_index.extend Segment::RangeIndex
30
+ annotation_index.corpus = corpus
31
+ annotation_index
32
+ end
33
+
34
+ end
35
+
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
 
4
4
  module Segmented
5
5
  extend Annotation
@@ -0,0 +1,23 @@
1
+ require 'rbbt/segment'
2
+
3
+ module Token
4
+ extend Entity
5
+ include Segment
6
+
7
+ self.annotation :original
8
+
9
+ def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
10
+
11
+ tokens = []
12
+ while matchdata = text.match(split_at)
13
+ tokens << Token.setup(matchdata.pre_match, :offset => start) unless matchdata.pre_match.empty?
14
+ tokens << Token.setup(matchdata.captures.first, :offset => start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
15
+ start += matchdata.end(0)
16
+ text = matchdata.post_match
17
+ end
18
+
19
+ tokens << Token.setup(text, :offset => start) unless text.empty?
20
+
21
+ tokens
22
+ end
23
+ end
@@ -1,6 +1,3 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/text/segment'
3
-
4
1
  module Transformed
5
2
 
6
3
  def self.transform(text, segments, replacement = nil, &block)
@@ -111,10 +108,10 @@ module Transformed
111
108
 
112
109
  self[updated_begin..updated_end] = new
113
110
 
114
- @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
111
+ @transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
115
112
 
116
113
  segment.replace original_text
117
- stack << segment.segment_id
114
+ stack << segment.object_id
118
115
  end
119
116
  @transformation_stack << stack
120
117
  end
@@ -122,13 +119,13 @@ module Transformed
122
119
  def fix_segment(segment, range, diff)
123
120
  case
124
121
  # Before
125
- when segment.end < range.begin
122
+ when segment.eend < range.begin
126
123
  # After
127
124
  when segment.offset.to_i > range.end + diff
128
125
  segment.offset = segment.offset.to_i - diff
129
126
  # Includes
130
- when (segment.offset.to_i <= range.begin and segment.end >= range.end + diff)
131
- segment.replace self[segment.offset.to_i..segment.end - diff]
127
+ when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
128
+ segment.replace self[segment.offset.to_i..segment.eend - diff]
132
129
  else
133
130
  raise "Segment Overlaps"
134
131
  end
@@ -141,7 +138,8 @@ module Transformed
141
138
 
142
139
  if first_only
143
140
  @transformation_stack.pop.reverse.each do |id|
144
- orig_range, diff, text, range = @transformed_segments.delete id
141
+ segment_info = @transformed_segments.delete id
142
+ orig_range, diff, text, range = segment_info
145
143
 
146
144
  new_range = (range.begin..range.last + diff)
147
145
  self[new_range] = text
@@ -0,0 +1,41 @@
1
+ #module Segment
2
+ #
3
+ # def self.set_tsv_fields(fields, segments)
4
+ # tsv_fields = []
5
+ # add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
6
+ # literal = (fields.delete(:literal) || fields.delete("literal"))
7
+ # tsv_fields << "Start" << "End"
8
+ # tsv_fields << :annotation_types if add_types
9
+ # tsv_fields << :literal if literal
10
+ #
11
+ # if fields.any? and not (fields == [:all] or fields == ["all"])
12
+ # tsv_fields.concat fields
13
+ # else
14
+ # tsv_fields.concat segments.first.annotations if segments.any?
15
+ # end
16
+ # tsv_fields
17
+ # tsv_fields.collect!{|f| f.to_s}
18
+ # tsv_fields.delete "offset"
19
+ # tsv_fields
20
+ # end
21
+ #
22
+ # def self.tsv(segments, *fields)
23
+ # fields = set_tsv_fields fields, segments
24
+ # tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
25
+ #
26
+ # segments.each do |segment|
27
+ # tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
28
+ # end
29
+ #
30
+ # tsv
31
+ # end
32
+ #
33
+ # def self.load_tsv(tsv)
34
+ # fields = tsv.fields
35
+ # tsv.with_unnamed do
36
+ # tsv.collect do |id, values|
37
+ # Annotated.load_tsv_values(id, values, fields)
38
+ # end
39
+ # end
40
+ # end
41
+ #end
@@ -12,7 +12,7 @@ pkg_dir="`opt_dir \"$name\"`"
12
12
  build_dir=`build_dir`
13
13
  mv "$build_dir" "$pkg_dir"
14
14
  tmp_file="~/.rbbt/tmp/species-proxy-properties.tmp"
15
- mkdir -p $(basename "$tmp_file")
15
+ mkdir -p $(dirname "$tmp_file")
16
16
  cat "$pkg_dir/species-proxy/properties.conf" |grep -v "^.dir =" >> $tmp_file
17
17
  echo "\$dir = $pkg_dir/species-proxy/" > "$pkg_dir/species-proxy/properties.conf"
18
18
  cat $tmp_file | grep -v "^#" >> "$pkg_dir/species-proxy/properties.conf"
@@ -0,0 +1,15 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/document/corpus/pubmed'
5
+
6
+ class TestCorpusPubmed < Test::Unit::TestCase
7
+ def test_add_pmid
8
+ corpus = Document::Corpus.setup({})
9
+
10
+ document = corpus.add_pmid("32299157", :abstract).first
11
+ title = document.to(:title)
12
+ assert title.include?("COVID-19")
13
+ end
14
+ end
15
+
@@ -0,0 +1,140 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/document/annotation'
6
+ require 'rbbt/segment/named_entity'
7
+
8
+ class TestAnnotation < Test::Unit::TestCase
9
+ class CalledOnce < Exception; end
10
+ def setup
11
+ Document.define :words do
12
+ self.split(" ")
13
+ end
14
+
15
+ $called_once = false
16
+ Document.define :persisted_words do
17
+ raise CalledOnce if $called_once
18
+ $called_once = true
19
+ self.split(" ")
20
+ end
21
+
22
+ Document.define_multiple :multiple_words do |list|
23
+ list.collect{|doc| doc.words}
24
+ end
25
+
26
+ Document.define :ner do
27
+ $called_once = true
28
+ self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
29
+ end
30
+
31
+ Document.persist :ner
32
+ end
33
+
34
+ def test_define
35
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
36
+ Document.setup(text, "TEST", "test_doc1", nil)
37
+
38
+ corpus = {}
39
+ corpus.extend Document::Corpus
40
+
41
+ corpus.add_document(text)
42
+
43
+ assert_equal text[text.words[1].range], text.words[1]
44
+ end
45
+
46
+ def test_define_multiple
47
+ text1 = "This sentence mentions the TP53 gene and the CDK5R1 protein"
48
+ text2 = "This is another sentence"
49
+ Document.setup(text1, "TEST", "test_doc1", nil)
50
+ Document.setup(text2, "TEST", "test_doc2", nil)
51
+
52
+ corpus = {}
53
+ corpus.extend Document::Corpus
54
+
55
+ corpus.add_document(text1)
56
+ corpus.add_document(text2)
57
+
58
+ assert_equal 2, Document.setup([text1, text2]).multiple_words.length
59
+ assert_equal text1.split(" "), text1.multiple_words
60
+
61
+ #Document.persist :multiple_words, :annotations, :annotation_repo => Rbbt.tmp.test.multiple_words
62
+ #assert_equal 2, Document.setup([text1, text2]).multiple_words.length
63
+ #assert_equal text1.split(" "), text1.multiple_words
64
+ end
65
+
66
+ def test_persist
67
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
68
+ Document.setup(text, "TEST", "test_doc1", nil)
69
+
70
+ corpus = {}
71
+ corpus.extend Document::Corpus
72
+
73
+ corpus.add_document(text)
74
+
75
+ assert_equal "persisted_words", text.persisted_words.first.type
76
+
77
+ assert_raise CalledOnce do
78
+ assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
79
+ end
80
+
81
+ Log.severity = 0
82
+ Document.persist :persisted_words, :annotations, :file => Rbbt.tmp.test.persisted_words.find(:user)
83
+
84
+ $called_once = false
85
+ text.persisted_words
86
+ assert $called_once
87
+
88
+ assert_nothing_raised do
89
+ assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
90
+ end
91
+ end
92
+
93
+ def test_persist_annotation_repo
94
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
+ Document.setup(text, "TEST", "test_doc1", nil)
96
+
97
+ corpus = {}
98
+ corpus.extend Document::Corpus
99
+
100
+ corpus.add_document(text)
101
+
102
+ assert_equal "persisted_words", text.persisted_words.first.type
103
+
104
+ assert_raise CalledOnce do
105
+ assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
106
+ end
107
+
108
+ Log.severity = 0
109
+ Document.persist :persisted_words, :annotations, :annotation_repo => Rbbt.tmp.test.persisted_words_repo.find(:user)
110
+
111
+ $called_once = false
112
+ text.persisted_words
113
+ assert $called_once
114
+
115
+ assert_nothing_raised do
116
+ assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
117
+ end
118
+ end
119
+
120
+ def test_persist_ner
121
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
122
+ Document.setup(text, "TEST", "test_doc1", nil)
123
+
124
+ corpus = {}
125
+ corpus.extend Document::Corpus
126
+
127
+ corpus.add_document(text)
128
+
129
+
130
+ text.ner
131
+
132
+ $called_once = false
133
+ text.ner
134
+
135
+ assert ! $called_once
136
+
137
+ assert text.ner.first.segid.include?("TEST:")
138
+ end
139
+ end
140
+
@@ -0,0 +1,33 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestDocumentCorpus < Test::Unit::TestCase
6
+ def test_corpus
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ corpus = Document::Corpus.setup({})
11
+
12
+ corpus.add_document(text)
13
+
14
+ docid = text.docid(corpus)
15
+
16
+ assert_equal docid.document, text
17
+ end
18
+
19
+ def test_find
20
+ text = "This is a document"
21
+ Document.setup(text, "TEST", "test_doc1", nil)
22
+
23
+ TmpFile.with_file do |path|
24
+ corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
25
+ corpus.extend Document::Corpus
26
+
27
+ corpus.add_document(text)
28
+
29
+ assert corpus.prefix("TEST:").include?(text.docid)
30
+ end
31
+ end
32
+ end
33
+
@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
8
8
 
9
9
  class TestFinder < Test::Unit::TestCase
10
10
 
11
- def test_namespace_and_format
11
+ def _test_namespace_and_format
12
12
  f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
13
13
  assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
14
14
  assert_equal "Ensembl Gene ID", f.instances.first.format
15
15
  end
16
16
 
17
- def test_find
17
+ def _test_find
18
18
  f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
19
19
 
20
20
  assert_equal "ENSG00000115524", f.find("SF3B1").first
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
23
23
  end
24
24
  end
25
25
 
26
- def test_find2
26
+ def _test_find2
27
27
  f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
28
28
 
29
29
  m = f.find("RAS").first
@@ -9,7 +9,17 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
9
9
  EOF
10
10
 
11
11
  mentions = GNormPlus.process({:file => text})
12
- Log.tsv mentions
12
+ assert_equal 1, mentions.length
13
+ assert_equal 2, mentions["file"].length
14
+ end
15
+
16
+ def test_entities
17
+ text =<<-EOF
18
+ We found that TP53 is regulated by MDM2 in Homo sapiens
19
+ EOF
20
+
21
+ mentions = GNormPlus.entities({:file => text})
22
+ mentions["file"].include? "TP53"
13
23
  end
14
24
  end
15
25
 
@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
2
2
  require 'rbbt/ner/patterns'
3
3
 
4
4
  class TestPatternRelExt < Test::Unit::TestCase
5
- def test_simple_pattern
5
+ def _test_simple_pattern
6
6
  text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
7
7
 
8
8
  gene1 = "TP53"
9
- NamedEntity.setup(gene1, text.index(gene1), "Gene")
9
+ NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
10
10
 
11
11
  gene2 = "CDK5"
12
- NamedEntity.setup(gene2, text.index(gene2), "Gene")
12
+ NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
13
13
 
14
14
  interaction = "interacts"
15
- NamedEntity.setup(interaction, text.index(interaction), "Interaction")
15
+ NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
16
16
 
17
17
  Segmented.setup(text, [gene1, gene2, interaction])
18
18
 
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
23
23
  text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
24
24
 
25
25
  gene1 = "TP53"
26
- NamedEntity.setup(gene1, text.index(gene1), "Gene")
26
+ NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
27
27
 
28
28
  gene2 = "CDK5"
29
- NamedEntity.setup(gene2, text.index(gene2), "Gene")
29
+ NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
30
30
 
31
31
  interaction = "interacts"
32
- NamedEntity.setup(interaction, text.index(interaction), "Interaction")
32
+ NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
33
33
 
34
34
  Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
35
35
 
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
40
40
  PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
41
41
  end
42
42
 
43
- def test_chunk_pattern
43
+ def _test_chunk_pattern
44
44
  text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
45
45
 
46
46
  drug = "thiazolidinediones"
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
57
57
  end
58
58
 
59
59
 
60
- def test_entities_with_spaces
60
+ def _test_entities_with_spaces
61
61
  PatternRelExt.new("NP[entity:Gene Name]").token_trie
62
62
  end
63
63