rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,24 +0,0 @@
1
- require 'rbbt/text/segment'
2
-
3
- module Relationship
4
- extend Annotation
5
- self.annotation :segment
6
- self.annotation :terms
7
- self.annotation :type
8
-
9
- def text
10
- if segment
11
- segment
12
- else
13
- type + ": " + terms * ", "
14
- end
15
- end
16
-
17
- def html
18
- text = <<-EOF
19
- <span class='Relationship'\
20
- >#{ self.text }</span>
21
- EOF
22
- text.chomp
23
- end
24
- end
@@ -1,49 +0,0 @@
1
- require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
3
-
4
- module Token
5
- attr_accessor :offset, :original
6
-
7
- def self.all_annotations
8
- [:offset, :original]
9
- end
10
-
11
- def self.setup(text, start, original = nil)
12
- text.extend Token
13
- text.offset = start
14
- text.original = original
15
- text
16
- end
17
-
18
- def info
19
- {:original => original, :offset => offset}
20
- end
21
-
22
- def id
23
- Misc.hash2md5 info.merge :self => self
24
- end
25
-
26
- def end
27
- offset + self.length - 1
28
- end
29
-
30
- def range
31
- (offset..self.end)
32
- end
33
-
34
- def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
35
-
36
- tokens = []
37
- while matchdata = text.match(split_at)
38
- tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
39
- tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
40
- start += matchdata.end(0)
41
- text = matchdata.post_match
42
- end
43
-
44
- tokens << Token.setup(text, start) unless text.empty?
45
-
46
- tokens
47
- end
48
- end
49
-
@@ -1,33 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
2
- require 'rbbt/text/document'
3
- require 'rbbt/text/corpus'
4
- require 'rbbt/text/corpus/sources/pmid'
5
-
6
- class TestCorpusPMID < Test::Unit::TestCase
7
- def setup
8
- Log.severity = 0
9
- Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
10
-
11
- Corpus::Document.define :words do
12
- words = self.text.split(" ")
13
- Segment.align(self.text, words)
14
- end
15
-
16
- Corpus::Document.define :genes do
17
- require 'rbbt/ner/banner'
18
- Banner.new.match(self.text)
19
- end
20
-
21
- Corpus::Document.persist_in_global_tsv("genes")
22
- Corpus::Document.persist_in_global_tsv(:words)
23
- end
24
-
25
- def test_query
26
- docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)
27
-
28
- docids.each do |docid|
29
- iif Document.corpus.docid(docid).text
30
- end
31
- end
32
- end
33
-
@@ -1,82 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/corpus/document'
3
-
4
- class TestCorpusDocument < Test::Unit::TestCase
5
- def setup
6
- Log.severity = 0
7
-
8
- Corpus::Document.define :words do
9
- words = self.text.split(" ")
10
- Segment.align(self.text, words)
11
- end
12
-
13
- Corpus::Document.define_multiple :words2 do |documents|
14
- documents.collect do |doc|
15
- words = doc.text.split(" ")
16
- Segment.align(doc.text, words)
17
- end
18
- end
19
-
20
- Open.mkdir Rbbt.tmp.test.annotations.find
21
-
22
- Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
23
- Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
24
- end
25
-
26
- def test_words
27
- text = "This is a test document"
28
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
29
- assert_equal Segment.sort(document.words), text.split(" ")
30
- assert document.words.first.docid
31
- assert document.words.first.segment_id.include?("TEST")
32
- end
33
-
34
- def test_words_multiple
35
- document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
36
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
37
- document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
38
-
39
- docs = [document1, document2, document3]
40
-
41
- Corpus::Document.prepare_multiple(docs, :words2)
42
-
43
- assert document1.words.first.docid
44
- assert document1.words.first.segment_id.include?("TEST")
45
-
46
- assert_equal document1.words2, document1.text.split(" ")
47
- assert_equal document2.words2, document2.text.split(" ")
48
- assert_equal document3.words2, document3.text.split(" ")
49
-
50
- document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
51
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
52
-
53
- docs = [document1, document2]
54
-
55
- Corpus::Document.prepare_multiple(docs, :words2)
56
- end
57
-
58
- def test_parallel
59
- text =<<-EOF
60
- This is a test document number
61
- EOF
62
-
63
- docs = []
64
- 100.times do |i|
65
- docs << text.chomp + " " + i.to_s
66
- end
67
-
68
- Log.with_severity 0 do
69
- TSV.traverse docs, :cpus => 10, :bar => true do |doc|
70
- hash = Misc.digest(doc)
71
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
72
- assert_equal Segment.sort(document.words), document.text.split(" ")
73
- end
74
- TSV.traverse docs, :cpus => 10, :bar => true do |doc|
75
- hash = Misc.digest(doc)
76
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
77
- assert_equal Segment.sort(document.words), document.text.split(" ")
78
- end
79
- end
80
- end
81
- end
82
-
@@ -1,23 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/segment/segmented'
3
-
4
- class TestClass < Test::Unit::TestCase
5
- def test_split
6
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
7
-
8
- gene1 = "TP53"
9
- gene1.extend Segment
10
- gene1.offset = a.index gene1
11
-
12
- gene2 = "CDK5R1"
13
- gene2.extend Segment
14
- gene2.offset = a.index gene2
15
-
16
- gene3 = "TP53 gene"
17
- gene3.extend Segment
18
- gene3.offset = a.index gene3
19
-
20
- Segmented.setup(a, [gene2, gene1, gene3])
21
- assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], a.split_segments
22
- end
23
- end
@@ -1,34 +0,0 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- $LOAD_PATH.unshift(File.dirname(__FILE__))
3
- require 'test/unit'
4
- require 'rbbt-util'
5
- require 'rbbt/text/corpus'
6
-
7
- class Corpus::Document
8
-
9
- define :words do
10
- text.split(" ")
11
- end
12
- end
13
-
14
- class TestClass < Test::Unit::TestCase
15
- def test_document
16
- Log.severity = 0
17
- text = "This is a test document"
18
-
19
- docid = nil
20
- TmpFile.with_file do |dir|
21
- corpus = Corpus.new dir
22
- docid = corpus.add_document text, :TEST, :test_doc
23
- document = corpus.docid(docid)
24
- assert_equal text, document.text
25
-
26
- corpus = Corpus.new dir
27
- document = corpus.docid(docid)
28
- assert_equal text, document.text
29
- document = corpus.find(:TEST, :test_doc).first
30
- assert_equal text, document.text
31
- end
32
- end
33
- end
34
-
@@ -1,58 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/text/document'
3
- require 'rbbt/text/corpus/sources/pmid'
4
-
5
- class TestDocument < Test::Unit::TestCase
6
- def setup
7
- Log.severity = 0
8
- Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
9
-
10
- Corpus::Document.define :words do
11
- words = self.text.split(" ")
12
- Segment.align(self.text, words)
13
- end
14
-
15
- Corpus::Document.define :genes do
16
- require 'rbbt/ner/banner'
17
- Banner.new.match(self.text)
18
- end
19
-
20
- Corpus::Document.persist_in_global_tsv("genes")
21
- Corpus::Document.persist_in_global_tsv(:words)
22
- end
23
-
24
- def test_title_and_text
25
- document = Document.setup('PMID:32272262')
26
-
27
- assert document.text.downcase.include?("covid")
28
- assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
29
- end
30
-
31
- def test_full_text
32
- document = Document.setup('PMID:4304705')
33
- assert document.text.length < document.full_text.length
34
- end
35
-
36
- def test_words
37
- document = Document.setup('PMID:32272262')
38
- words = document.entities :words
39
- assert words.first.respond_to?(:offset)
40
- end
41
-
42
- def test_genes
43
- text = "This is a mention to TP53, a gene that should be found"
44
- document = Document.setup(Document.corpus.add_document(text, "TEST"))
45
- genes = document.entities :genes
46
-
47
- assert_equal "TP53", genes.first
48
- assert genes.first.respond_to?(:offset)
49
-
50
- text = "This is a mention to TP53, a gene that should be found"
51
- document = Document.setup(Document.corpus.add_document(text, "TEST"))
52
- genes = document.entities :genes
53
-
54
- assert_equal "TP53", genes.first
55
- assert genes.first.respond_to?(:offset)
56
- end
57
- end
58
-
@@ -1,100 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/text/segment'
3
-
4
- class TestSegment < Test::Unit::TestCase
5
- def test_info
6
- a = "test"
7
- a.extend Segment
8
- a.offset = 10
9
- assert a.info.include? :offset
10
- end
11
-
12
- def test_sort
13
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
14
-
15
- gene1 = "TP53"
16
- gene1.extend Segment
17
- gene1.offset = a.index gene1
18
-
19
- gene2 = "CDK5R1"
20
- gene2.extend Segment
21
- gene2.offset = a.index gene2
22
-
23
- assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
24
- end
25
-
26
- def test_clean_sort
27
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
28
-
29
- gene1 = "TP53"
30
- gene1.extend Segment
31
- gene1.offset = a.index gene1
32
-
33
- gene2 = "CDK5R1"
34
- gene2.extend Segment
35
- gene2.offset = a.index gene2
36
-
37
- gene3 = "TP53 gene"
38
- gene3.extend Segment
39
- gene3.offset = a.index gene3
40
-
41
- assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
42
- end
43
-
44
- def test_split
45
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
46
-
47
- gene1 = "TP53"
48
- gene1.extend Segment
49
- gene1.offset = a.index gene1
50
-
51
- gene2 = "CDK5R1"
52
- gene2.extend Segment
53
- gene2.offset = a.index gene2
54
-
55
- gene3 = "TP53 gene"
56
- gene3.extend Segment
57
- gene3.offset = a.index gene3
58
-
59
- assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(a, [gene2,gene1,gene3])
60
- end
61
-
62
-
63
- def test_align
64
- text =<<-EOF
65
- Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
66
- EOF
67
-
68
- parts = text.split(/\W/)
69
- Segment.align(text, parts)
70
-
71
- assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
72
- end
73
-
74
- def test_segment_index
75
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
76
-
77
- gene1 = "TP53"
78
- gene1.extend Segment
79
- gene1.offset = a.index gene1
80
-
81
- gene2 = "CDK5R1"
82
- gene2.extend Segment
83
- gene2.offset = a.index gene2
84
-
85
- gene3 = "TP53 gene"
86
- gene3.extend Segment
87
- gene3.offset = a.index gene3
88
-
89
- index = Segment.index([gene1, gene2, gene3])
90
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
91
-
92
- TmpFile.with_file do |fwt|
93
- index = Segment.index([gene1, gene2, gene3], fwt)
94
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
95
- index = Segment.index([gene1, gene2, gene3], fwt)
96
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
97
- end
98
- end
99
- end
100
-