rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+
4
+ class TestDocument < Test::Unit::TestCase
5
+
6
+ def test_docid
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
11
+ end
12
+
13
+ end
14
+
@@ -0,0 +1,187 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+
6
+ class TestSegment < Test::Unit::TestCase
7
+ def test_segment
8
+ text = "This is a document"
9
+ Document.setup(text, "TEST", "test_doc1", nil)
10
+
11
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
12
+
13
+ assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
14
+ end
15
+
16
+ def test_segid
17
+ text = "This is a document"
18
+ Document.setup(text, "TEST", "test_doc1", nil)
19
+
20
+ corpus = {}
21
+ corpus.extend Document::Corpus
22
+
23
+ corpus.add_document(text)
24
+
25
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
26
+
27
+ segid = segment.segid(corpus)
28
+
29
+ segment = segid.segment
30
+ assert_equal "is", segment
31
+ end
32
+
33
+ def test_info
34
+ segment = "test"
35
+ segment.extend Segment
36
+ segment.offset = 10
37
+ assert segment.info.include? :offset
38
+ end
39
+
40
+ def test_sort
41
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
42
+ Document.setup(text, "TEST", "test_doc1", nil)
43
+
44
+ corpus = {}
45
+ corpus.extend Document::Corpus
46
+
47
+ corpus.add_document(text)
48
+
49
+ gene1 = "TP53"
50
+ gene1.extend Segment
51
+ gene1.offset = text.index gene1
52
+ gene1.docid = text.docid
53
+
54
+ gene2 = "CDK5R1"
55
+ gene2.extend Segment
56
+ gene2.offset = text.index gene2
57
+ gene2.docid = text.docid
58
+
59
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
60
+
61
+ assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
62
+ end
63
+
64
+ def test_clean_sort
65
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
66
+ Document.setup(text, "TEST", "test_doc1", nil)
67
+
68
+ corpus = {}
69
+ corpus.extend Document::Corpus
70
+
71
+ corpus.add_document(text)
72
+
73
+ gene1 = "TP53"
74
+ gene1.extend Segment
75
+ gene1.offset = text.index gene1
76
+ gene1.docid = text.docid
77
+
78
+ gene2 = "CDK5R1"
79
+ gene2.extend Segment
80
+ gene2.offset = text.index gene2
81
+ gene2.docid = text.docid
82
+
83
+ gene3 = "TP53 gene"
84
+ gene3.extend Segment
85
+ gene3.offset = text.index gene1
86
+ gene3.docid = text.docid
87
+
88
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
89
+
90
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
91
+ end
92
+
93
+ def test_split
94
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
+ Document.setup(text, "TEST", "test_doc1", nil)
96
+
97
+ corpus = {}
98
+ corpus.extend Document::Corpus
99
+
100
+ corpus.add_document(text)
101
+
102
+ gene1 = "TP53"
103
+ gene1.extend Segment
104
+ gene1.offset = text.index gene1
105
+ gene1.docid = text.docid
106
+
107
+ gene2 = "CDK5R1"
108
+ gene2.extend Segment
109
+ gene2.offset = text.index gene2
110
+ gene2.docid = text.docid
111
+
112
+ gene3 = "TP53 gene"
113
+ gene3.extend Segment
114
+ gene3.offset = text.index gene1
115
+ gene3.docid = text.docid
116
+
117
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
118
+
119
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
120
+ end
121
+
122
+
123
+ def test_align
124
+ text =<<-EOF
125
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
126
+ EOF
127
+
128
+ parts = text.split(/\W/)
129
+ Segment.align(text, parts)
130
+
131
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
132
+
133
+ Document.setup(text, "TEST", "test_doc1", nil)
134
+
135
+ parts = text.split(/\W/)
136
+ Segment.align(text, parts)
137
+
138
+ assert_equal parts.first.docid, text.docid
139
+ end
140
+
141
+ def test_segment_index
142
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
143
+ Document.setup(text, "TEST", "test_doc1", nil)
144
+
145
+ corpus = {}
146
+ corpus.extend Document::Corpus
147
+
148
+ corpus.add_document(text)
149
+
150
+ gene1 = "TP53"
151
+ gene1.extend Segment
152
+ gene1.offset = text.index gene1
153
+ gene1.docid = text.docid
154
+
155
+ gene2 = "CDK5R1"
156
+ gene2.extend Segment
157
+ gene2.offset = text.index gene2
158
+ gene2.docid = text.docid
159
+
160
+ gene3 = "TP53 gene"
161
+ gene3.extend Segment
162
+ gene3.offset = text.index gene1
163
+ gene3.docid = text.docid
164
+
165
+ index = Segment.index([gene1, gene2, gene3], corpus)
166
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
167
+
168
+ TmpFile.with_file do |fwt|
169
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
170
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
171
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
172
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
173
+ end
174
+
175
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
176
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
177
+
178
+ TmpFile.with_file do |fwt|
179
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
180
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
181
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
182
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
183
+ end
184
+ end
185
+
186
+ end
187
+
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/text/corpus'
9
+ #require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- Corpus::DocumentRepo::TC_CONNECTIONS.clear
25
+ if defined? Corpus
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
27
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
28
+ end
27
29
  end
28
30
 
29
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-16 00:00:00.000000000 Z
11
+ date: 2020-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,6 +78,10 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
+ - lib/rbbt/document.rb
82
+ - lib/rbbt/document/annotation.rb
83
+ - lib/rbbt/document/corpus.rb
84
+ - lib/rbbt/document/corpus/pubmed.rb
81
85
  - lib/rbbt/ner/NER.rb
82
86
  - lib/rbbt/ner/abner.rb
83
87
  - lib/rbbt/ner/banner.rb
@@ -98,18 +102,16 @@ files:
98
102
  - lib/rbbt/nlp/genia/sentence_splitter.rb
99
103
  - lib/rbbt/nlp/nlp.rb
100
104
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
- - lib/rbbt/text/corpus.rb
102
- - lib/rbbt/text/corpus/document.rb
103
- - lib/rbbt/text/corpus/document_repo.rb
104
- - lib/rbbt/text/corpus/sources/pmid.rb
105
- - lib/rbbt/text/document.rb
106
- - lib/rbbt/text/segment.rb
107
- - lib/rbbt/text/segment/docid.rb
108
- - lib/rbbt/text/segment/named_entity.rb
109
- - lib/rbbt/text/segment/relationship.rb
110
- - lib/rbbt/text/segment/segmented.rb
111
- - lib/rbbt/text/segment/token.rb
112
- - lib/rbbt/text/segment/transformed.rb
105
+ - lib/rbbt/segment.rb
106
+ - lib/rbbt/segment/annotation.rb
107
+ - lib/rbbt/segment/encoding.rb
108
+ - lib/rbbt/segment/named_entity.rb
109
+ - lib/rbbt/segment/overlaps.rb
110
+ - lib/rbbt/segment/range_index.rb
111
+ - lib/rbbt/segment/segmented.rb
112
+ - lib/rbbt/segment/token.rb
113
+ - lib/rbbt/segment/transformed.rb
114
+ - lib/rbbt/segment/tsv.rb
113
115
  - share/install/software/ABNER
114
116
  - share/install/software/BANNER
115
117
  - share/install/software/ChemicalTagger
@@ -128,6 +130,9 @@ files:
128
130
  - test/rbbt/bow/test_bow.rb
129
131
  - test/rbbt/bow/test_dictionary.rb
130
132
  - test/rbbt/bow/test_misc.rb
133
+ - test/rbbt/document/corpus/test_pubmed.rb
134
+ - test/rbbt/document/test_annotation.rb
135
+ - test/rbbt/document/test_corpus.rb
131
136
  - test/rbbt/entity/test_document.rb
132
137
  - test/rbbt/ner/test_NER.rb
133
138
  - test/rbbt/ner/test_abner.rb
@@ -146,15 +151,15 @@ files:
146
151
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
147
152
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
148
153
  - test/rbbt/nlp/test_nlp.rb
149
- - test/rbbt/text/corpus/sources/test_pmid.rb
150
- - test/rbbt/text/corpus/test_document.rb
151
- - test/rbbt/text/segment/test_named_entity.rb
152
- - test/rbbt/text/segment/test_relationship.rb
153
- - test/rbbt/text/segment/test_segmented.rb
154
- - test/rbbt/text/segment/test_transformed.rb
155
- - test/rbbt/text/test_corpus.rb
156
- - test/rbbt/text/test_document.rb
157
- - test/rbbt/text/test_segment.rb
154
+ - test/rbbt/segment/test_annotation.rb
155
+ - test/rbbt/segment/test_corpus.rb
156
+ - test/rbbt/segment/test_encoding.rb
157
+ - test/rbbt/segment/test_named_entity.rb
158
+ - test/rbbt/segment/test_overlaps.rb
159
+ - test/rbbt/segment/test_range_index.rb
160
+ - test/rbbt/segment/test_transformed.rb
161
+ - test/rbbt/test_document.rb
162
+ - test/rbbt/test_segment.rb
158
163
  - test/test_helper.rb
159
164
  homepage: http://github.com/mikisvaz/rbbt-util
160
165
  licenses: []
@@ -182,18 +187,13 @@ test_files:
182
187
  - test/rbbt/nlp/test_nlp.rb
183
188
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
184
189
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
- - test/rbbt/text/test_document.rb
186
- - test/rbbt/text/corpus/sources/test_pmid.rb
187
- - test/rbbt/text/corpus/test_document.rb
188
- - test/rbbt/text/test_segment.rb
189
- - test/rbbt/text/test_corpus.rb
190
- - test/rbbt/text/segment/test_transformed.rb
191
- - test/rbbt/text/segment/test_relationship.rb
192
- - test/rbbt/text/segment/test_named_entity.rb
193
- - test/rbbt/text/segment/test_segmented.rb
194
190
  - test/rbbt/bow/test_bow.rb
195
191
  - test/rbbt/bow/test_misc.rb
196
192
  - test/rbbt/bow/test_dictionary.rb
193
+ - test/rbbt/test_document.rb
194
+ - test/rbbt/document/test_annotation.rb
195
+ - test/rbbt/document/corpus/test_pubmed.rb
196
+ - test/rbbt/document/test_corpus.rb
197
197
  - test/rbbt/entity/test_document.rb
198
198
  - test/rbbt/ner/test_patterns.rb
199
199
  - test/rbbt/ner/test_NER.rb
@@ -209,4 +209,12 @@ test_files:
209
209
  - test/rbbt/ner/test_finder.rb
210
210
  - test/rbbt/ner/test_linnaeus.rb
211
211
  - test/rbbt/ner/test_oscar4.rb
212
+ - test/rbbt/test_segment.rb
213
+ - test/rbbt/segment/test_transformed.rb
214
+ - test/rbbt/segment/test_overlaps.rb
215
+ - test/rbbt/segment/test_annotation.rb
216
+ - test/rbbt/segment/test_named_entity.rb
217
+ - test/rbbt/segment/test_encoding.rb
218
+ - test/rbbt/segment/test_range_index.rb
219
+ - test/rbbt/segment/test_corpus.rb
212
220
  - test/test_helper.rb
@@ -1,106 +0,0 @@
1
- require 'rbbt/text/corpus/document'
2
- require 'rbbt/text/corpus/document_repo'
3
-
4
- class Corpus
5
- class << self
6
- attr_accessor :claims
7
- def claim(namespace, &block)
8
- @@claims = {}
9
- @@claims[namespace] = block
10
- end
11
-
12
- end
13
- attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
14
-
15
- def initialize(corpora_path = nil)
16
- @corpora_path = case
17
- when corpora_path.nil?
18
- Rbbt.corpora
19
- when (not Path === corpora_path)
20
- Path.setup(corpora_path)
21
- else
22
- corpora_path
23
- end
24
-
25
- @corpora_path = @corpora_path.find
26
- @persistence_dir = File.join(@corpora_path, "annotations")
27
-
28
- Misc.lock(@persistence_dir) do
29
- @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
30
- @global_annotations.unnamed = true
31
- @global_annotations.close
32
- end
33
-
34
- Misc.lock(@corpora_path.document_repo) do
35
- @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
36
- @document_repo.close
37
- end
38
-
39
- end
40
-
41
- def persistence_for(docid)
42
- File.join(persistence_dir, docid)
43
- end
44
-
45
-
46
- def docid(docid)
47
- begin
48
- if @document_repo.include?(docid)
49
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
50
- else
51
- namespace, id, type = docid.split(":")
52
- if @@claims.include?(namespace)
53
-
54
- docid = self.instance_exec id, type, &(@@claims[namespace])
55
- docid = docid.first if Array === docid
56
- self.docid(docid)
57
- else
58
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
59
- end
60
- end
61
- ensure
62
- @document_repo.close
63
- end
64
- end
65
-
66
- def document(namespace, id, type, hash)
67
- docid = [namespace, id, type, hash] * ":"
68
- self.docid(docid)
69
- end
70
-
71
- def add_document(text, namespace = nil, id = nil, type = nil)
72
- text = Misc.fixutf8(text)
73
- hash = Digest::MD5.hexdigest(text)
74
- @document_repo.add(text, namespace, id, type, hash)
75
- end
76
-
77
- def add_docid(text, docid)
78
- namespace, id, type, hash = docid.split(":")
79
- @document_repo.add(text, namespace, id, type, hash)
80
- end
81
-
82
-
83
- def find(namespace=nil, id = nil, type = nil, hash = nil)
84
- @document_repo.find(namespace, id, type, hash).collect{|docid|
85
- self.docid(docid)
86
- }
87
- end
88
-
89
- def find_docid(docid)
90
- @document_repo.find_docid(docid).collect{|docid|
91
- self.docid(docid)
92
- }
93
- end
94
-
95
- def exists?(namespace=nil, id = nil, type = nil, hash = nil)
96
- find(namespace, id, type, hash).any?
97
- end
98
-
99
- def [](docid)
100
- self.docid(docid)
101
- end
102
-
103
- def include?(id)
104
- @document_repo.include? id
105
- end
106
- end