rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
data/test/rbbt/test_document.rb ADDED
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+
4
+ class TestDocument < Test::Unit::TestCase
5
+
6
+ def test_docid
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
11
+ end
12
+
13
+ end
14
+
data/test/rbbt/test_segment.rb ADDED
@@ -0,0 +1,187 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+
6
+ class TestSegment < Test::Unit::TestCase
7
+ def test_segment
8
+ text = "This is a document"
9
+ Document.setup(text, "TEST", "test_doc1", nil)
10
+
11
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
12
+
13
+ assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
14
+ end
15
+
16
+ def test_segid
17
+ text = "This is a document"
18
+ Document.setup(text, "TEST", "test_doc1", nil)
19
+
20
+ corpus = {}
21
+ corpus.extend Document::Corpus
22
+
23
+ corpus.add_document(text)
24
+
25
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
26
+
27
+ segid = segment.segid(corpus)
28
+
29
+ segment = segid.segment
30
+ assert_equal "is", segment
31
+ end
32
+
33
+ def test_info
34
+ segment = "test"
35
+ segment.extend Segment
36
+ segment.offset = 10
37
+ assert segment.info.include? :offset
38
+ end
39
+
40
+ def test_sort
41
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
42
+ Document.setup(text, "TEST", "test_doc1", nil)
43
+
44
+ corpus = {}
45
+ corpus.extend Document::Corpus
46
+
47
+ corpus.add_document(text)
48
+
49
+ gene1 = "TP53"
50
+ gene1.extend Segment
51
+ gene1.offset = text.index gene1
52
+ gene1.docid = text.docid
53
+
54
+ gene2 = "CDK5R1"
55
+ gene2.extend Segment
56
+ gene2.offset = text.index gene2
57
+ gene2.docid = text.docid
58
+
59
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
60
+
61
+ assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
62
+ end
63
+
64
+ def test_clean_sort
65
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
66
+ Document.setup(text, "TEST", "test_doc1", nil)
67
+
68
+ corpus = {}
69
+ corpus.extend Document::Corpus
70
+
71
+ corpus.add_document(text)
72
+
73
+ gene1 = "TP53"
74
+ gene1.extend Segment
75
+ gene1.offset = text.index gene1
76
+ gene1.docid = text.docid
77
+
78
+ gene2 = "CDK5R1"
79
+ gene2.extend Segment
80
+ gene2.offset = text.index gene2
81
+ gene2.docid = text.docid
82
+
83
+ gene3 = "TP53 gene"
84
+ gene3.extend Segment
85
+ gene3.offset = text.index gene1
86
+ gene3.docid = text.docid
87
+
88
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
89
+
90
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
91
+ end
92
+
93
+ def test_split
94
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
+ Document.setup(text, "TEST", "test_doc1", nil)
96
+
97
+ corpus = {}
98
+ corpus.extend Document::Corpus
99
+
100
+ corpus.add_document(text)
101
+
102
+ gene1 = "TP53"
103
+ gene1.extend Segment
104
+ gene1.offset = text.index gene1
105
+ gene1.docid = text.docid
106
+
107
+ gene2 = "CDK5R1"
108
+ gene2.extend Segment
109
+ gene2.offset = text.index gene2
110
+ gene2.docid = text.docid
111
+
112
+ gene3 = "TP53 gene"
113
+ gene3.extend Segment
114
+ gene3.offset = text.index gene1
115
+ gene3.docid = text.docid
116
+
117
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
118
+
119
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
120
+ end
121
+
122
+
123
+ def test_align
124
+ text =<<-EOF
125
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
126
+ EOF
127
+
128
+ parts = text.split(/\W/)
129
+ Segment.align(text, parts)
130
+
131
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
132
+
133
+ Document.setup(text, "TEST", "test_doc1", nil)
134
+
135
+ parts = text.split(/\W/)
136
+ Segment.align(text, parts)
137
+
138
+ assert_equal parts.first.docid, text.docid
139
+ end
140
+
141
+ def test_segment_index
142
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
143
+ Document.setup(text, "TEST", "test_doc1", nil)
144
+
145
+ corpus = {}
146
+ corpus.extend Document::Corpus
147
+
148
+ corpus.add_document(text)
149
+
150
+ gene1 = "TP53"
151
+ gene1.extend Segment
152
+ gene1.offset = text.index gene1
153
+ gene1.docid = text.docid
154
+
155
+ gene2 = "CDK5R1"
156
+ gene2.extend Segment
157
+ gene2.offset = text.index gene2
158
+ gene2.docid = text.docid
159
+
160
+ gene3 = "TP53 gene"
161
+ gene3.extend Segment
162
+ gene3.offset = text.index gene1
163
+ gene3.docid = text.docid
164
+
165
+ index = Segment.index([gene1, gene2, gene3], corpus)
166
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
167
+
168
+ TmpFile.with_file do |fwt|
169
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
170
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
171
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
172
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
173
+ end
174
+
175
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
176
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
177
+
178
+ TmpFile.with_file do |fwt|
179
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
180
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
181
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
182
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
183
+ end
184
+ end
185
+
186
+ end
187
+
data/test/test_helper.rb CHANGED
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/text/corpus'
9
+ #require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- Corpus::DocumentRepo::TC_CONNECTIONS.clear
25
+ if defined? Corpus
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
27
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
28
+ end
27
29
  end
28
30
 
29
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-16 00:00:00.000000000 Z
11
+ date: 2020-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,6 +78,10 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
+ - lib/rbbt/document.rb
82
+ - lib/rbbt/document/annotation.rb
83
+ - lib/rbbt/document/corpus.rb
84
+ - lib/rbbt/document/corpus/pubmed.rb
81
85
  - lib/rbbt/ner/NER.rb
82
86
  - lib/rbbt/ner/abner.rb
83
87
  - lib/rbbt/ner/banner.rb
@@ -98,18 +102,16 @@ files:
98
102
  - lib/rbbt/nlp/genia/sentence_splitter.rb
99
103
  - lib/rbbt/nlp/nlp.rb
100
104
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
- - lib/rbbt/text/corpus.rb
102
- - lib/rbbt/text/corpus/document.rb
103
- - lib/rbbt/text/corpus/document_repo.rb
104
- - lib/rbbt/text/corpus/sources/pmid.rb
105
- - lib/rbbt/text/document.rb
106
- - lib/rbbt/text/segment.rb
107
- - lib/rbbt/text/segment/docid.rb
108
- - lib/rbbt/text/segment/named_entity.rb
109
- - lib/rbbt/text/segment/relationship.rb
110
- - lib/rbbt/text/segment/segmented.rb
111
- - lib/rbbt/text/segment/token.rb
112
- - lib/rbbt/text/segment/transformed.rb
105
+ - lib/rbbt/segment.rb
106
+ - lib/rbbt/segment/annotation.rb
107
+ - lib/rbbt/segment/encoding.rb
108
+ - lib/rbbt/segment/named_entity.rb
109
+ - lib/rbbt/segment/overlaps.rb
110
+ - lib/rbbt/segment/range_index.rb
111
+ - lib/rbbt/segment/segmented.rb
112
+ - lib/rbbt/segment/token.rb
113
+ - lib/rbbt/segment/transformed.rb
114
+ - lib/rbbt/segment/tsv.rb
113
115
  - share/install/software/ABNER
114
116
  - share/install/software/BANNER
115
117
  - share/install/software/ChemicalTagger
@@ -128,6 +130,9 @@ files:
128
130
  - test/rbbt/bow/test_bow.rb
129
131
  - test/rbbt/bow/test_dictionary.rb
130
132
  - test/rbbt/bow/test_misc.rb
133
+ - test/rbbt/document/corpus/test_pubmed.rb
134
+ - test/rbbt/document/test_annotation.rb
135
+ - test/rbbt/document/test_corpus.rb
131
136
  - test/rbbt/entity/test_document.rb
132
137
  - test/rbbt/ner/test_NER.rb
133
138
  - test/rbbt/ner/test_abner.rb
@@ -146,15 +151,15 @@ files:
146
151
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
147
152
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
148
153
  - test/rbbt/nlp/test_nlp.rb
149
- - test/rbbt/text/corpus/sources/test_pmid.rb
150
- - test/rbbt/text/corpus/test_document.rb
151
- - test/rbbt/text/segment/test_named_entity.rb
152
- - test/rbbt/text/segment/test_relationship.rb
153
- - test/rbbt/text/segment/test_segmented.rb
154
- - test/rbbt/text/segment/test_transformed.rb
155
- - test/rbbt/text/test_corpus.rb
156
- - test/rbbt/text/test_document.rb
157
- - test/rbbt/text/test_segment.rb
154
+ - test/rbbt/segment/test_annotation.rb
155
+ - test/rbbt/segment/test_corpus.rb
156
+ - test/rbbt/segment/test_encoding.rb
157
+ - test/rbbt/segment/test_named_entity.rb
158
+ - test/rbbt/segment/test_overlaps.rb
159
+ - test/rbbt/segment/test_range_index.rb
160
+ - test/rbbt/segment/test_transformed.rb
161
+ - test/rbbt/test_document.rb
162
+ - test/rbbt/test_segment.rb
158
163
  - test/test_helper.rb
159
164
  homepage: http://github.com/mikisvaz/rbbt-util
160
165
  licenses: []
@@ -182,18 +187,13 @@ test_files:
182
187
  - test/rbbt/nlp/test_nlp.rb
183
188
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
184
189
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
- - test/rbbt/text/test_document.rb
186
- - test/rbbt/text/corpus/sources/test_pmid.rb
187
- - test/rbbt/text/corpus/test_document.rb
188
- - test/rbbt/text/test_segment.rb
189
- - test/rbbt/text/test_corpus.rb
190
- - test/rbbt/text/segment/test_transformed.rb
191
- - test/rbbt/text/segment/test_relationship.rb
192
- - test/rbbt/text/segment/test_named_entity.rb
193
- - test/rbbt/text/segment/test_segmented.rb
194
190
  - test/rbbt/bow/test_bow.rb
195
191
  - test/rbbt/bow/test_misc.rb
196
192
  - test/rbbt/bow/test_dictionary.rb
193
+ - test/rbbt/test_document.rb
194
+ - test/rbbt/document/test_annotation.rb
195
+ - test/rbbt/document/corpus/test_pubmed.rb
196
+ - test/rbbt/document/test_corpus.rb
197
197
  - test/rbbt/entity/test_document.rb
198
198
  - test/rbbt/ner/test_patterns.rb
199
199
  - test/rbbt/ner/test_NER.rb
@@ -209,4 +209,12 @@ test_files:
209
209
  - test/rbbt/ner/test_finder.rb
210
210
  - test/rbbt/ner/test_linnaeus.rb
211
211
  - test/rbbt/ner/test_oscar4.rb
212
+ - test/rbbt/test_segment.rb
213
+ - test/rbbt/segment/test_transformed.rb
214
+ - test/rbbt/segment/test_overlaps.rb
215
+ - test/rbbt/segment/test_annotation.rb
216
+ - test/rbbt/segment/test_named_entity.rb
217
+ - test/rbbt/segment/test_encoding.rb
218
+ - test/rbbt/segment/test_range_index.rb
219
+ - test/rbbt/segment/test_corpus.rb
212
220
  - test/test_helper.rb
data/lib/rbbt/text/corpus.rb DELETED
@@ -1,106 +0,0 @@
1
- require 'rbbt/text/corpus/document'
2
- require 'rbbt/text/corpus/document_repo'
3
-
4
- class Corpus
5
- class << self
6
- attr_accessor :claims
7
- def claim(namespace, &block)
8
- @@claims = {}
9
- @@claims[namespace] = block
10
- end
11
-
12
- end
13
- attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
14
-
15
- def initialize(corpora_path = nil)
16
- @corpora_path = case
17
- when corpora_path.nil?
18
- Rbbt.corpora
19
- when (not Path === corpora_path)
20
- Path.setup(corpora_path)
21
- else
22
- corpora_path
23
- end
24
-
25
- @corpora_path = @corpora_path.find
26
- @persistence_dir = File.join(@corpora_path, "annotations")
27
-
28
- Misc.lock(@persistence_dir) do
29
- @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
30
- @global_annotations.unnamed = true
31
- @global_annotations.close
32
- end
33
-
34
- Misc.lock(@corpora_path.document_repo) do
35
- @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
36
- @document_repo.close
37
- end
38
-
39
- end
40
-
41
- def persistence_for(docid)
42
- File.join(persistence_dir, docid)
43
- end
44
-
45
-
46
- def docid(docid)
47
- begin
48
- if @document_repo.include?(docid)
49
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
50
- else
51
- namespace, id, type = docid.split(":")
52
- if @@claims.include?(namespace)
53
-
54
- docid = self.instance_exec id, type, &(@@claims[namespace])
55
- docid = docid.first if Array === docid
56
- self.docid(docid)
57
- else
58
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
59
- end
60
- end
61
- ensure
62
- @document_repo.close
63
- end
64
- end
65
-
66
- def document(namespace, id, type, hash)
67
- docid = [namespace, id, type, hash] * ":"
68
- self.docid(docid)
69
- end
70
-
71
- def add_document(text, namespace = nil, id = nil, type = nil)
72
- text = Misc.fixutf8(text)
73
- hash = Digest::MD5.hexdigest(text)
74
- @document_repo.add(text, namespace, id, type, hash)
75
- end
76
-
77
- def add_docid(text, docid)
78
- namespace, id, type, hash = docid.split(":")
79
- @document_repo.add(text, namespace, id, type, hash)
80
- end
81
-
82
-
83
- def find(namespace=nil, id = nil, type = nil, hash = nil)
84
- @document_repo.find(namespace, id, type, hash).collect{|docid|
85
- self.docid(docid)
86
- }
87
- end
88
-
89
- def find_docid(docid)
90
- @document_repo.find_docid(docid).collect{|docid|
91
- self.docid(docid)
92
- }
93
- end
94
-
95
- def exists?(namespace=nil, id = nil, type = nil, hash = nil)
96
- find(namespace, id, type, hash).any?
97
- end
98
-
99
- def [](docid)
100
- self.docid(docid)
101
- end
102
-
103
- def include?(id)
104
- @document_repo.include? id
105
- end
106
- end