rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,24 +0,0 @@
1
- require 'rbbt/text/segment'
2
-
3
- module Relationship
4
- extend Annotation
5
- self.annotation :segment
6
- self.annotation :terms
7
- self.annotation :type
8
-
9
- def text
10
- if segment
11
- segment
12
- else
13
- type + ": " + terms * ", "
14
- end
15
- end
16
-
17
- def html
18
- text = <<-EOF
19
- <span class='Relationship'\
20
- >#{ self.text }</span>
21
- EOF
22
- text.chomp
23
- end
24
- end
@@ -1,49 +0,0 @@
1
- require 'rbbt/annotations'
2
- require 'rbbt/text/segment'
3
-
4
- module Token
5
- attr_accessor :offset, :original
6
-
7
- def self.all_annotations
8
- [:offset, :original]
9
- end
10
-
11
- def self.setup(text, start, original = nil)
12
- text.extend Token
13
- text.offset = start
14
- text.original = original
15
- text
16
- end
17
-
18
- def info
19
- {:original => original, :offset => offset}
20
- end
21
-
22
- def id
23
- Misc.hash2md5 info.merge :self => self
24
- end
25
-
26
- def end
27
- offset + self.length - 1
28
- end
29
-
30
- def range
31
- (offset..self.end)
32
- end
33
-
34
- def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
35
-
36
- tokens = []
37
- while matchdata = text.match(split_at)
38
- tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
39
- tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
40
- start += matchdata.end(0)
41
- text = matchdata.post_match
42
- end
43
-
44
- tokens << Token.setup(text, start) unless text.empty?
45
-
46
- tokens
47
- end
48
- end
49
-
@@ -1,33 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
2
- require 'rbbt/text/document'
3
- require 'rbbt/text/corpus'
4
- require 'rbbt/text/corpus/sources/pmid'
5
-
6
- class TestCorpusPMID < Test::Unit::TestCase
7
- def setup
8
- Log.severity = 0
9
- Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
10
-
11
- Corpus::Document.define :words do
12
- words = self.text.split(" ")
13
- Segment.align(self.text, words)
14
- end
15
-
16
- Corpus::Document.define :genes do
17
- require 'rbbt/ner/banner'
18
- Banner.new.match(self.text)
19
- end
20
-
21
- Corpus::Document.persist_in_global_tsv("genes")
22
- Corpus::Document.persist_in_global_tsv(:words)
23
- end
24
-
25
- def test_query
26
- docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)
27
-
28
- docids.each do |docid|
29
- iif Document.corpus.docid(docid).text
30
- end
31
- end
32
- end
33
-
@@ -1,82 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/corpus/document'
3
-
4
- class TestCorpusDocument < Test::Unit::TestCase
5
- def setup
6
- Log.severity = 0
7
-
8
- Corpus::Document.define :words do
9
- words = self.text.split(" ")
10
- Segment.align(self.text, words)
11
- end
12
-
13
- Corpus::Document.define_multiple :words2 do |documents|
14
- documents.collect do |doc|
15
- words = doc.text.split(" ")
16
- Segment.align(doc.text, words)
17
- end
18
- end
19
-
20
- Open.mkdir Rbbt.tmp.test.annotations.find
21
-
22
- Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
23
- Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
24
- end
25
-
26
- def test_words
27
- text = "This is a test document"
28
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
29
- assert_equal Segment.sort(document.words), text.split(" ")
30
- assert document.words.first.docid
31
- assert document.words.first.segment_id.include?("TEST")
32
- end
33
-
34
- def test_words_multiple
35
- document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
36
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
37
- document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
38
-
39
- docs = [document1, document2, document3]
40
-
41
- Corpus::Document.prepare_multiple(docs, :words2)
42
-
43
- assert document1.words.first.docid
44
- assert document1.words.first.segment_id.include?("TEST")
45
-
46
- assert_equal document1.words2, document1.text.split(" ")
47
- assert_equal document2.words2, document2.text.split(" ")
48
- assert_equal document3.words2, document3.text.split(" ")
49
-
50
- document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
51
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
52
-
53
- docs = [document1, document2]
54
-
55
- Corpus::Document.prepare_multiple(docs, :words2)
56
- end
57
-
58
- def test_parallel
59
- text =<<-EOF
60
- This is a test document number
61
- EOF
62
-
63
- docs = []
64
- 100.times do |i|
65
- docs << text.chomp + " " + i.to_s
66
- end
67
-
68
- Log.with_severity 0 do
69
- TSV.traverse docs, :cpus => 10, :bar => true do |doc|
70
- hash = Misc.digest(doc)
71
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
72
- assert_equal Segment.sort(document.words), document.text.split(" ")
73
- end
74
- TSV.traverse docs, :cpus => 10, :bar => true do |doc|
75
- hash = Misc.digest(doc)
76
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
77
- assert_equal Segment.sort(document.words), document.text.split(" ")
78
- end
79
- end
80
- end
81
- end
82
-
@@ -1,23 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/segment/segmented'
3
-
4
- class TestClass < Test::Unit::TestCase
5
- def test_split
6
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
7
-
8
- gene1 = "TP53"
9
- gene1.extend Segment
10
- gene1.offset = a.index gene1
11
-
12
- gene2 = "CDK5R1"
13
- gene2.extend Segment
14
- gene2.offset = a.index gene2
15
-
16
- gene3 = "TP53 gene"
17
- gene3.extend Segment
18
- gene3.offset = a.index gene3
19
-
20
- Segmented.setup(a, [gene2, gene1, gene3])
21
- assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], a.split_segments
22
- end
23
- end
@@ -1,34 +0,0 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- $LOAD_PATH.unshift(File.dirname(__FILE__))
3
- require 'test/unit'
4
- require 'rbbt-util'
5
- require 'rbbt/text/corpus'
6
-
7
- class Corpus::Document
8
-
9
- define :words do
10
- text.split(" ")
11
- end
12
- end
13
-
14
- class TestClass < Test::Unit::TestCase
15
- def test_document
16
- Log.severity = 0
17
- text = "This is a test document"
18
-
19
- docid = nil
20
- TmpFile.with_file do |dir|
21
- corpus = Corpus.new dir
22
- docid = corpus.add_document text, :TEST, :test_doc
23
- document = corpus.docid(docid)
24
- assert_equal text, document.text
25
-
26
- corpus = Corpus.new dir
27
- document = corpus.docid(docid)
28
- assert_equal text, document.text
29
- document = corpus.find(:TEST, :test_doc).first
30
- assert_equal text, document.text
31
- end
32
- end
33
- end
34
-
@@ -1,58 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/text/document'
3
- require 'rbbt/text/corpus/sources/pmid'
4
-
5
- class TestDocument < Test::Unit::TestCase
6
- def setup
7
- Log.severity = 0
8
- Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
9
-
10
- Corpus::Document.define :words do
11
- words = self.text.split(" ")
12
- Segment.align(self.text, words)
13
- end
14
-
15
- Corpus::Document.define :genes do
16
- require 'rbbt/ner/banner'
17
- Banner.new.match(self.text)
18
- end
19
-
20
- Corpus::Document.persist_in_global_tsv("genes")
21
- Corpus::Document.persist_in_global_tsv(:words)
22
- end
23
-
24
- def test_title_and_text
25
- document = Document.setup('PMID:32272262')
26
-
27
- assert document.text.downcase.include?("covid")
28
- assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
29
- end
30
-
31
- def test_full_text
32
- document = Document.setup('PMID:4304705')
33
- assert document.text.length < document.full_text.length
34
- end
35
-
36
- def test_words
37
- document = Document.setup('PMID:32272262')
38
- words = document.entities :words
39
- assert words.first.respond_to?(:offset)
40
- end
41
-
42
- def test_genes
43
- text = "This is a mention to TP53, a gene that should be found"
44
- document = Document.setup(Document.corpus.add_document(text, "TEST"))
45
- genes = document.entities :genes
46
-
47
- assert_equal "TP53", genes.first
48
- assert genes.first.respond_to?(:offset)
49
-
50
- text = "This is a mention to TP53, a gene that should be found"
51
- document = Document.setup(Document.corpus.add_document(text, "TEST"))
52
- genes = document.entities :genes
53
-
54
- assert_equal "TP53", genes.first
55
- assert genes.first.respond_to?(:offset)
56
- end
57
- end
58
-
@@ -1,100 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/text/segment'
3
-
4
- class TestSegment < Test::Unit::TestCase
5
- def test_info
6
- a = "test"
7
- a.extend Segment
8
- a.offset = 10
9
- assert a.info.include? :offset
10
- end
11
-
12
- def test_sort
13
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
14
-
15
- gene1 = "TP53"
16
- gene1.extend Segment
17
- gene1.offset = a.index gene1
18
-
19
- gene2 = "CDK5R1"
20
- gene2.extend Segment
21
- gene2.offset = a.index gene2
22
-
23
- assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
24
- end
25
-
26
- def test_clean_sort
27
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
28
-
29
- gene1 = "TP53"
30
- gene1.extend Segment
31
- gene1.offset = a.index gene1
32
-
33
- gene2 = "CDK5R1"
34
- gene2.extend Segment
35
- gene2.offset = a.index gene2
36
-
37
- gene3 = "TP53 gene"
38
- gene3.extend Segment
39
- gene3.offset = a.index gene3
40
-
41
- assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
42
- end
43
-
44
- def test_split
45
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
46
-
47
- gene1 = "TP53"
48
- gene1.extend Segment
49
- gene1.offset = a.index gene1
50
-
51
- gene2 = "CDK5R1"
52
- gene2.extend Segment
53
- gene2.offset = a.index gene2
54
-
55
- gene3 = "TP53 gene"
56
- gene3.extend Segment
57
- gene3.offset = a.index gene3
58
-
59
- assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(a, [gene2,gene1,gene3])
60
- end
61
-
62
-
63
- def test_align
64
- text =<<-EOF
65
- Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
66
- EOF
67
-
68
- parts = text.split(/\W/)
69
- Segment.align(text, parts)
70
-
71
- assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
72
- end
73
-
74
- def test_segment_index
75
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
76
-
77
- gene1 = "TP53"
78
- gene1.extend Segment
79
- gene1.offset = a.index gene1
80
-
81
- gene2 = "CDK5R1"
82
- gene2.extend Segment
83
- gene2.offset = a.index gene2
84
-
85
- gene3 = "TP53 gene"
86
- gene3.extend Segment
87
- gene3.offset = a.index gene3
88
-
89
- index = Segment.index([gene1, gene2, gene3])
90
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
91
-
92
- TmpFile.with_file do |fwt|
93
- index = Segment.index([gene1, gene2, gene3], fwt)
94
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
95
- index = Segment.index([gene1, gene2, gene3], fwt)
96
- assert_equal %w(CDK5R1), index[gene2.offset + 1]
97
- end
98
- end
99
- end
100
-