rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,35 @@
1
# Mixin for a range index built over segments. Lookups are passed through
# SegID.setup together with the corpus the index was built for.
module Segment::RangeIndex
  # Corpus handed to SegID.setup for every lookup result.
  attr_accessor :corpus

  # Runs the extended object's own #[] with the given arguments and passes
  # the result to SegID.setup along with this index's corpus.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds a range index for +segments+ and extends it with this module.
  #
  # segments     - a collection of segments, or a Hash whose values are
  #                flattened into one.
  # corpus       - stored on the resulting index (see #corpus).
  # persist_file - passed to Persist.persist as :file; persistence is
  #                switched off when it is nil or :memory.
  #
  # Returns the index with its corpus assigned.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments

    in_memory = persist_file.nil? || persist_file == :memory

    annotation_index = Persist.persist("Segment_index", :fwt, :persist => !in_memory, :file => persist_file) do
      # One [segid, [begin, end]] entry per segment; segments without an
      # offset are left out.
      entries = segments.collect do |segment|
        next if segment.offset.nil?

        span = segment.range
        [segment.segid, [span.begin, span.end]]
      end.compact

      # The table's value size is the length of the longest segid (0 when
      # there are no entries).
      widest = entries.collect { |segid, _| segid.length }.max || 0

      FixWidthTable.get(:memory, widest, true).tap do |table|
        table.add_range entries
      end
    end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end
end
35
+
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
2
+ require 'rbbt/segment'
3
3
 
4
4
  module Segmented
5
5
  extend Annotation
@@ -0,0 +1,23 @@
1
+ require 'rbbt/segment'
2
+
3
# Entity for the individual tokens of a text; includes Segment and declares
# an :original annotation.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ into tokens at every match of +split_at+.
  #
  # The text before each match becomes a token. When the first capture group
  # of the match is non-empty it becomes a token of its own; matches without
  # a capture (whitespace, with the default pattern) are dropped.
  #
  # text     - String to split.
  # split_at - Regexp marking the split points.
  # start    - offset assigned to the beginning of +text+.
  #
  # Returns an Array of tokens, each set up with its :offset.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    remaining = text
    position = start

    until (match = remaining.match(split_at)).nil?
      leading = match.pre_match
      tokens << Token.setup(leading, :offset => position) unless leading.empty?

      groups = match.captures
      if groups.any? && !groups.first.empty?
        tokens << Token.setup(groups.first, :offset => position + match.begin(1))
      end

      # Continue after the match, keeping offsets relative to the full text.
      position += match.end(0)
      remaining = match.post_match
    end

    tokens << Token.setup(remaining, :offset => position) unless remaining.empty?

    tokens
  end
end
@@ -1,6 +1,3 @@
1
- require 'rbbt/util/misc'
2
- require 'rbbt/text/segment'
3
-
4
1
  module Transformed
5
2
 
6
3
  def self.transform(text, segments, replacement = nil, &block)
@@ -111,10 +108,10 @@ module Transformed
111
108
 
112
109
  self[updated_begin..updated_end] = new
113
110
 
114
- @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
111
+ @transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
115
112
 
116
113
  segment.replace original_text
117
- stack << segment.segment_id
114
+ stack << segment.object_id
118
115
  end
119
116
  @transformation_stack << stack
120
117
  end
@@ -122,13 +119,13 @@ module Transformed
122
119
  def fix_segment(segment, range, diff)
123
120
  case
124
121
  # Before
125
- when segment.end < range.begin
122
+ when segment.eend < range.begin
126
123
  # After
127
124
  when segment.offset.to_i > range.end + diff
128
125
  segment.offset = segment.offset.to_i - diff
129
126
  # Includes
130
- when (segment.offset.to_i <= range.begin and segment.end >= range.end + diff)
131
- segment.replace self[segment.offset.to_i..segment.end - diff]
127
+ when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
128
+ segment.replace self[segment.offset.to_i..segment.eend - diff]
132
129
  else
133
130
  raise "Segment Overlaps"
134
131
  end
@@ -141,7 +138,8 @@ module Transformed
141
138
 
142
139
  if first_only
143
140
  @transformation_stack.pop.reverse.each do |id|
144
- orig_range, diff, text, range = @transformed_segments.delete id
141
+ segment_info = @transformed_segments.delete id
142
+ orig_range, diff, text, range = segment_info
145
143
 
146
144
  new_range = (range.begin..range.last + diff)
147
145
  self[new_range] = text
@@ -0,0 +1,41 @@
1
+ #module Segment
2
+ #
3
+ # def self.set_tsv_fields(fields, segments)
4
+ # tsv_fields = []
5
+ # add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
6
+ # literal = (fields.delete(:literal) || fields.delete("literal"))
7
+ # tsv_fields << "Start" << "End"
8
+ # tsv_fields << :annotation_types if add_types
9
+ # tsv_fields << :literal if literal
10
+ #
11
+ # if fields.any? and not (fields == [:all] or fields == ["all"])
12
+ # tsv_fields.concat fields
13
+ # else
14
+ # tsv_fields.concat segments.first.annotations if segments.any?
15
+ # end
16
+ # tsv_fields
17
+ # tsv_fields.collect!{|f| f.to_s}
18
+ # tsv_fields.delete "offset"
19
+ # tsv_fields
20
+ # end
21
+ #
22
+ # def self.tsv(segments, *fields)
23
+ # fields = set_tsv_fields fields, segments
24
+ # tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
25
+ #
26
+ # segments.each do |segment|
27
+ # tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
28
+ # end
29
+ #
30
+ # tsv
31
+ # end
32
+ #
33
+ # def self.load_tsv(tsv)
34
+ # fields = tsv.fields
35
+ # tsv.with_unnamed do
36
+ # tsv.collect do |id, values|
37
+ # Annotated.load_tsv_values(id, values, fields)
38
+ # end
39
+ # end
40
+ # end
41
+ #end
@@ -12,7 +12,7 @@ pkg_dir="`opt_dir \"$name\"`"
12
12
  build_dir=`build_dir`
13
13
  mv "$build_dir" "$pkg_dir"
14
14
  tmp_file="~/.rbbt/tmp/species-proxy-properties.tmp"
15
- mkdir -p $(basename "$tmp_file")
15
+ mkdir -p $(dirname "$tmp_file")
16
16
  cat "$pkg_dir/species-proxy/properties.conf" |grep -v "^.dir =" >> $tmp_file
17
17
  echo "\$dir = $pkg_dir/species-proxy/" > "$pkg_dir/species-proxy/properties.conf"
18
18
  cat $tmp_file | grep -v "^#" >> "$pkg_dir/species-proxy/properties.conf"
@@ -0,0 +1,15 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/document/corpus/pubmed'
5
+
6
# Exercises Document::Corpus#add_pmid.
class TestCorpusPubmed < Test::Unit::TestCase
  # Adds PMID 32299157 (abstract) to an empty corpus and checks that the
  # title derived from the first returned document mentions "COVID-19".
  def test_add_pmid
    corpus = Document::Corpus.setup({})

    documents = corpus.add_pmid("32299157", :abstract)
    title = documents.first.to(:title)

    assert title.include?("COVID-19")
  end
end
15
+
@@ -0,0 +1,140 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/document/annotation'
6
+ require 'rbbt/segment/named_entity'
7
+
8
# Tests for annotations defined on Document through Document.define,
# Document.define_multiple and Document.persist.
class TestAnnotation < Test::Unit::TestCase
  # Raised by the :persisted_words annotation when it is computed again
  # while $called_once is still set.
  class CalledOnce < Exception; end

  # Registers the annotations used by the tests below and resets the
  # $called_once flag.
  def setup
    Document.define(:words) { self.split(" ") }

    $called_once = false
    Document.define(:persisted_words) do
      raise CalledOnce if $called_once
      $called_once = true
      self.split(" ")
    end

    Document.define_multiple(:multiple_words) do |list|
      list.collect { |doc| doc.words }
    end

    Document.define(:ner) do
      $called_once = true
      self.split(" ").collect { |e| NamedEntity.setup(e, :code => Misc.digest(e)) }
    end

    Document.persist :ner
  end

  # Each word returned by the :words annotation carries a range that points
  # back at that word in the document.
  def test_define
    text = new_document(sample_sentence, "test_doc1")
    corpus_holding(text)

    assert_equal text[text.words[1].range], text.words[1]
  end

  # :multiple_words works on a list of documents and on a single document.
  def test_define_multiple
    text1 = new_document(sample_sentence, "test_doc1")
    text2 = new_document("This is another sentence", "test_doc2")
    corpus_holding(text1, text2)

    assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    assert_equal text1.split(" "), text1.multiple_words

    #Document.persist :multiple_words, :annotations, :annotation_repo => Rbbt.tmp.test.multiple_words
    #assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    #assert_equal text1.split(" "), text1.multiple_words
  end

  # Without persistence a second call to :persisted_words raises CalledOnce;
  # once persisted to a file, repeated calls no longer raise.
  def test_persist
    text = new_document(sample_sentence, "test_doc1")
    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    assert_raise(CalledOnce) do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :file => Rbbt.tmp.test.persisted_words.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    assert_nothing_raised do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end
  end

  # Same scenario as test_persist, persisting into an annotation repo.
  def test_persist_annotation_repo
    text = new_document(sample_sentence, "test_doc1")
    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    assert_raise(CalledOnce) do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :annotation_repo => Rbbt.tmp.test.persisted_words_repo.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    assert_nothing_raised do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end
  end

  # The persisted :ner annotation is not recomputed on the second call, and
  # its entities get segids that include the "TEST:" namespace.
  def test_persist_ner
    text = new_document(sample_sentence, "test_doc1")
    corpus_holding(text)

    text.ner

    $called_once = false
    text.ner

    assert !$called_once

    assert text.ner.first.segid.include?("TEST:")
  end

  private

  # A fresh copy of the sentence most tests use as document text.
  def sample_sentence
    "This sentence mentions the TP53 gene and the CDK5R1 protein"
  end

  # Sets +content+ up as a Document in the "TEST" namespace under +id+ and
  # returns it.
  def new_document(content, id)
    Document.setup(content, "TEST", id, nil)
    content
  end

  # Builds a Hash-backed Document::Corpus holding +documents+, added in the
  # order given.
  def corpus_holding(*documents)
    corpus = {}
    corpus.extend Document::Corpus
    documents.each { |document| corpus.add_document(document) }
    corpus
  end
end
140
+
@@ -0,0 +1,33 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+
5
# Tests adding documents to a Document::Corpus and getting them back.
class TestDocumentCorpus < Test::Unit::TestCase
  # A document added to a Hash-backed corpus can be recovered through the
  # docid it yields for that corpus.
  def test_corpus
    text = "This is a document"
    Document.setup(text, "TEST", "test_doc1", nil)

    corpus = Document::Corpus.setup({})

    corpus.add_document(text)

    docid = text.docid(corpus)

    # Expected value first, so a failure reports the two sides correctly.
    assert_equal text, docid.document
  end

  # With a TokyoCabinet-backed corpus, looking up the "TEST:" prefix returns
  # the docid of the added document.
  def test_find
    text = "This is a document"
    Document.setup(text, "TEST", "test_doc1", nil)

    TmpFile.with_file do |path|
      corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
      corpus.extend Document::Corpus

      corpus.add_document(text)

      assert corpus.prefix("TEST:").include?(text.docid)
    end
  end
end
33
+
@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
8
8
 
9
9
  class TestFinder < Test::Unit::TestCase
10
10
 
11
- def test_namespace_and_format
11
+ def _test_namespace_and_format
12
12
  f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
13
13
  assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
14
14
  assert_equal "Ensembl Gene ID", f.instances.first.format
15
15
  end
16
16
 
17
- def test_find
17
+ def _test_find
18
18
  f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
19
19
 
20
20
  assert_equal "ENSG00000115524", f.find("SF3B1").first
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
23
23
  end
24
24
  end
25
25
 
26
- def test_find2
26
+ def _test_find2
27
27
  f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
28
28
 
29
29
  m = f.find("RAS").first
@@ -9,7 +9,17 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
9
9
  EOF
10
10
 
11
11
  mentions = GNormPlus.process({:file => text})
12
- Log.tsv mentions
12
+ assert_equal 1, mentions.length
13
+ assert_equal 2, mentions["file"].length
14
+ end
15
+
16
# Checks that GNormPlus.entities reports the TP53 mention for the "file"
# entry. The membership check is now asserted; previously its result was
# discarded, so the test could never fail on a missing entity.
#
# NOTE(review): the diff view strips indentation, so the heredoc body is kept
# flush-left here — confirm against the original file.
def test_entities
  text =<<-EOF
We found that TP53 is regulated by MDM2 in Homo sapiens
  EOF

  mentions = GNormPlus.entities({:file => text})
  assert mentions["file"].include?("TP53")
end
14
24
  end
15
25
 
@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
2
2
  require 'rbbt/ner/patterns'
3
3
 
4
4
  class TestPatternRelExt < Test::Unit::TestCase
5
- def test_simple_pattern
5
+ def _test_simple_pattern
6
6
  text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
7
7
 
8
8
  gene1 = "TP53"
9
- NamedEntity.setup(gene1, text.index(gene1), "Gene")
9
+ NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
10
10
 
11
11
  gene2 = "CDK5"
12
- NamedEntity.setup(gene2, text.index(gene2), "Gene")
12
+ NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
13
13
 
14
14
  interaction = "interacts"
15
- NamedEntity.setup(interaction, text.index(interaction), "Interaction")
15
+ NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
16
16
 
17
17
  Segmented.setup(text, [gene1, gene2, interaction])
18
18
 
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
23
23
  text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
24
24
 
25
25
  gene1 = "TP53"
26
- NamedEntity.setup(gene1, text.index(gene1), "Gene")
26
+ NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
27
27
 
28
28
  gene2 = "CDK5"
29
- NamedEntity.setup(gene2, text.index(gene2), "Gene")
29
+ NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
30
30
 
31
31
  interaction = "interacts"
32
- NamedEntity.setup(interaction, text.index(interaction), "Interaction")
32
+ NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
33
33
 
34
34
  Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
35
35
 
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
40
40
  PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
41
41
  end
42
42
 
43
- def test_chunk_pattern
43
+ def _test_chunk_pattern
44
44
  text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
45
45
 
46
46
  drug = "thiazolidinediones"
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
57
57
  end
58
58
 
59
59
 
60
- def test_entities_with_spaces
60
+ def _test_entities_with_spaces
61
61
  PatternRelExt.new("NP[entity:Gene Name]").token_trie
62
62
  end
63
63