rbbt-text 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
@@ -0,0 +1,68 @@
1
+ require 'rbbt/util/misc'
2
+ require 'tokyocabinet'
3
+
4
+ class Corpus
5
+ module DocumentRepo
6
+ class OpenError < StandardError;end
7
+ class KeyFormatError < StandardError;end
8
+
9
+ TC_CONNECTIONS = {}
10
+ def self.open_tokyocabinet(path, write)
11
+ database = Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB)
12
+ database.extend DocumentRepo
13
+ database
14
+ end
15
+
16
+ def docid2fields(docid)
17
+ docid.split(":", -1).values_at 0,1,2,3
18
+ end
19
+
20
+ def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
21
+ [namespace, id, type, hash] * ":"
22
+ end
23
+
24
+ def docid(docid)
25
+ get(docid)
26
+ end
27
+
28
+ def add(text, namespace, id, type, hash)
29
+ docid = fields2docid(namespace, id, type, hash)
30
+
31
+ return docid if self.include?(docid)
32
+
33
+ write_and_close do
34
+ self[docid] = text
35
+ end
36
+
37
+ docid
38
+ end
39
+
40
+ def find(namespace=nil, id = nil, type = nil, hash = nil)
41
+ case
42
+ when namespace.nil?
43
+ self.keys
44
+ when id.nil?
45
+ range_start = [namespace] * ":" + ':'
46
+ range_end = [namespace] * ":" + ';'
47
+ self.range(range_start, true, range_end, false)
48
+ when (type and hash)
49
+ [[namespace, id, type, hash] * ":"]
50
+ when hash
51
+ [[namespace, id, "", hash] * ":"]
52
+ when type
53
+ range_start = [namespace, id, type] * ":" + ':'
54
+ range_end = [namespace, id, type] * ":" + ';'
55
+ self.range(range_start, true, range_end, false)
56
+ else
57
+ range_start = [namespace, id] * ":" + ':'
58
+ range_end = [namespace, id] * ":" + ';'
59
+ self.range(range_start, true, range_end, false)
60
+ end
61
+ end
62
+
63
+ def find_docid(docid)
64
+ find(*docid2fields(docid))
65
+ end
66
+
67
+ end
68
+ end
@@ -0,0 +1,34 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
+ class Corpus
4
+
5
+ NAMESPACES = {} unless defined? NAMESPACES
6
+ NAMESPACES[:pubmed] = :add_pmid
7
+
8
+ def add_pmid(pmid, type = nil)
9
+ pmids = Array === pmid ? pmid : [pmid]
10
+ type = nil if String === type and type.empty?
11
+
12
+ PubMed.get_article(pmids).collect do |pmid, article|
13
+ Log.debug "Loading pmid #{pmid}"
14
+ if type.nil? || type.to_sym == :abstract
15
+ add_document(article.abstract || "", :PMID, pmid, :abstract)
16
+ elsif type.to_sym == :title
17
+ add_document(article.title, :PMID, pmid, :title)
18
+ else
19
+ raise "No FullText available for #{ pmid }" if article.full_text.nil?
20
+ add_document(article.full_text, :PMID, pmid, :fulltext)
21
+ end
22
+ end
23
+ end
24
+
25
+ def add_pubmed_query(query, max = 3000, type = nil)
26
+ pmids = PubMed.query(query, max)
27
+ add_pmid(pmids, type)
28
+ end
29
+
30
+ self.claim "PMID" do |id, type|
31
+ Log.debug "Claiming #{id}"
32
+ self.add_pmid(id, type)
33
+ end
34
+ end
@@ -0,0 +1,39 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
+ require 'rbbt/text/corpus'
5
+
6
+ module Document
7
+ extend Entity
8
+ class << self
9
+ attr_accessor :corpus
10
+ end
11
+
12
+ property :document => :single do
13
+ Document.corpus.docid(self)
14
+ end
15
+
16
+ property :type => :single do |type|
17
+ self.annotate((self.split(":").values_at(0,1)) * ":" + ":" + type.to_s)
18
+ end
19
+
20
+ property :title => :single do
21
+ type(:title).text
22
+ end
23
+
24
+ property :full_text => :single do
25
+ type(:full_text).text
26
+ end
27
+
28
+ property :abstract => :single do
29
+ type(:abstract).text
30
+ end
31
+
32
+ property :text => :single do
33
+ document.text
34
+ end
35
+
36
+ property :entities => :single do |type,*args|
37
+ document.method(type).call *args
38
+ end
39
+ end
@@ -77,9 +77,14 @@ module Segment
77
77
  self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
78
78
  end
79
79
 
80
+ def overlaps(segments)
81
+ segments.select{|s| self.overlaps?(s)}
82
+ end
83
+
84
+
80
85
  def self.collisions(main, secondary)
81
- collisions = secondary.select do |ss|
82
- collisions = main.select{|ms| ms.overlaps? ss }.any?
86
+ secondary.select do |ss|
87
+ main.select{|ms| ms.overlaps? ss }.any?
83
88
  end
84
89
  end
85
90
 
@@ -320,7 +325,7 @@ module Segment
320
325
  tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
321
326
 
322
327
  segments.each do |segment|
323
- tsv[segment.id] = self.tsv_values_for_segment(segment, fields)
328
+ tsv[segment.object_id.to_s] = self.tsv_values_for_segment(segment, fields)
324
329
  end
325
330
 
326
331
  tsv
@@ -343,8 +348,8 @@ module Segment
343
348
  [offset, self.end] * ".."
344
349
  end
345
350
 
346
- def ==(other)
347
- self.id == other.id
348
- end
351
+ #def ==(other)
352
+ # self.text == other.text
353
+ #end
349
354
  end
350
355
 
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/segment'
1
+ require 'rbbt/text/segment'
2
2
 
3
3
  module SegmentWithDocid
4
4
  extend Annotation
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/segment'
1
+ require 'rbbt/text/segment'
2
2
  require 'rbbt/entity'
3
3
 
4
4
  module NamedEntity
@@ -32,7 +32,7 @@ Score: #{score.inspect}
32
32
  format, entity = code.split(":")
33
33
  entity, format = format, nil if entity.nil?
34
34
 
35
- if defined? Entity and Entity.formats.include? type or Entity.formats.include? format
35
+ if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
36
36
  params ||= {}
37
37
  params[:format] = format if format and params[:format].nil?
38
38
  mod = (Entity.formats[type] || Entity.format[entity])
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/segment'
1
+ require 'rbbt/text/segment'
2
2
 
3
3
  module Relationship
4
4
  extend Annotation
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
 
4
4
  module Segmented
5
5
  extend Annotation
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/annotations'
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
 
4
4
  module Token
5
5
  attr_accessor :offset, :original
@@ -1,10 +1,12 @@
1
1
  require 'rbbt/util/misc'
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
 
4
4
  module Transformed
5
5
 
6
6
  def self.transform(text, segments, replacement = nil, &block)
7
7
 
8
+ block = replacement if Proc === replacement
9
+
8
10
  text.extend Transformed
9
11
  text.replace_segments(segments, replacement, &block)
10
12
 
@@ -24,68 +26,44 @@ module Transformed
24
26
  end
25
27
 
26
28
  attr_accessor :transformed_segments, :transformation_stack
27
-
29
+
28
30
  def shift(segment_o)
29
31
  begin_shift = 0
30
32
  end_shift = 0
31
33
 
34
+ text_offset = self.respond_to?(:offset)? self.offset.to_i : 0
32
35
  @transformed_segments.sort_by{|id, info| info.last}.each{|id,info|
33
- pseg_o, diff = info
36
+ pseg_o, diff, utext, pseg_u, index = info
37
+
38
+ pseg_u = ((pseg_u.begin + text_offset)..(pseg_u.last + text_offset))
34
39
 
35
40
  case
36
41
  # Before
37
- when segment_o.last + end_shift < pseg_o.begin
42
+ when segment_o.last + end_shift < pseg_u.begin
38
43
  # After
39
- when (segment_o.begin + begin_shift > pseg_o.last)
44
+ when (segment_o.begin + begin_shift > pseg_u.last)
40
45
  begin_shift += diff
41
46
  end_shift += diff
42
47
  # Includes
43
- when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
48
+ when (segment_o.begin + begin_shift <= pseg_u.begin and segment_o.last + end_shift >= pseg_u.last)
44
49
  end_shift += diff
45
50
  # Inside
46
- when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
51
+ when (segment_o.begin + begin_shift >= pseg_u.begin and segment_o.last + end_shift <= pseg_u.last)
47
52
  return nil
48
53
  # Overlaps start
49
- when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
54
+ when (segment_o.begin + begin_shift <= pseg_u.begin and segment_o.last + end_shift <= pseg_u.last)
50
55
  return nil
51
56
  # Overlaps end
52
- when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
57
+ when (segment_o.begin + begin_shift >= pseg_u.begin and segment_o.last + end_shift >= pseg_u.last)
53
58
  return nil
54
- else
55
- raise "Unknown overlaps: #{segment_o.inspect} - #{pseg_o.inspect}"
59
+ else
60
+ raise "Unknown overlaps: #{segment_o.inspect} - #{pseg_u.inspect}"
56
61
  end
57
62
  }
58
63
 
59
64
  [begin_shift, end_shift]
60
65
  end
61
66
 
62
- #def self.sort(segments)
63
- # segments.compact.sort do |a,b|
64
- # case
65
- # when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
66
- # 0
67
- # when (a.nil? || a.offset.nil?)
68
- # -1
69
- # when (b.nil? || b.offset.nil?)
70
- # +1
71
- # # Non-overlap
72
- # when (a.end < b.offset.to_i || b.end < a.offset.to_i)
73
- # b.offset <=> a.offset
74
- # # b includes a
75
- # when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
76
- # -1
77
- # # b includes a
78
- # when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
79
- # +1
80
- # # Overlap
81
- # when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
82
- # b.length <=> a.length
83
- # else
84
- # raise "Unexpected case in sort: #{a.range} - #{b.range}"
85
- # end
86
- # end
87
- #end
88
-
89
67
  def replace_segments(segments, replacement = nil, &block)
90
68
  @transformed_segments ||= {}
91
69
  @transformation_stack ||= []
@@ -93,8 +71,9 @@ module Transformed
93
71
 
94
72
  segments = [segments] unless Array === segments
95
73
  orig_length = self.length
96
- Segment.sort(segments).each do |segment|
74
+ Segment.clean_sort(segments).each do |segment|
97
75
  next if segment.offset.nil?
76
+
98
77
  shift = shift segment.range
99
78
 
100
79
  next if shift.nil?
@@ -102,6 +81,7 @@ module Transformed
102
81
  shift_begin, shift_end = shift
103
82
 
104
83
  text_offset = self.respond_to?(:offset)? self.offset.to_i : 0
84
+
105
85
  updated_begin = segment.offset.to_i + shift_begin - text_offset
106
86
  updated_end = segment.range.last + shift_end - text_offset
107
87
 
@@ -113,6 +93,8 @@ module Transformed
113
93
  next
114
94
  end
115
95
 
96
+ #raise "error '#{segment}' => '#{updated_text}'" if updated_text != segment
97
+
116
98
  original_text = segment.dup
117
99
  segment.replace updated_text
118
100
 
@@ -177,8 +159,31 @@ module Transformed
177
159
  end
178
160
  end
179
161
 
180
- def self.ansi(text, entities, colors = nil)
181
-
162
+ #def self.sort(segments)
163
+ # segments.compact.sort do |a,b|
164
+ # case
165
+ # when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
166
+ # 0
167
+ # when (a.nil? || a.offset.nil?)
168
+ # -1
169
+ # when (b.nil? || b.offset.nil?)
170
+ # +1
171
+ # # Non-overlap
172
+ # when (a.end < b.offset.to_i || b.end < a.offset.to_i)
173
+ # b.offset <=> a.offset
174
+ # # b includes a
175
+ # when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
176
+ # -1
177
+ # # b includes a
178
+ # when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
179
+ # +1
180
+ # # Overlap
181
+ # when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
182
+ # b.length <=> a.length
183
+ # else
184
+ # raise "Unexpected case in sort: #{a.range} - #{b.range}"
185
+ # end
186
+ # end
187
+ #end
182
188
 
183
- end
184
189
  end
@@ -8,6 +8,7 @@ Workflow.require_workflow "TextMining"
8
8
 
9
9
  require 'rbbt/entity/pmid'
10
10
  require 'rbbt/entity/document'
11
+ require 'rbbt/corpus'
11
12
  require 'test/unit'
12
13
 
13
14
 
@@ -13,6 +13,7 @@ class TestAbner < Test::Unit::TestCase
13
13
  assert(mentions.include? mention)
14
14
  }
15
15
  rescue
16
+ Log.exception $!
16
17
  end
17
18
  end
18
19
  end
@@ -11,6 +11,7 @@ class TestLinnaeus < Test::Unit::TestCase
11
11
  assert(mentions.include? mention)
12
12
  }
13
13
  rescue
14
+ Log.exception $!
14
15
  end
15
16
  end
16
17
  end
@@ -29,7 +29,6 @@ sentence. This is
29
29
  another sentence.
30
30
  EOF
31
31
 
32
- iii OpenNLP.sentence_split_detector.sentDetect(text)
33
32
  assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
34
33
 
35
34
  assert_equal 5, OpenNLP.sentence_splitter(text).length
@@ -0,0 +1,33 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
2
+ require 'rbbt/text/document'
3
+ require 'rbbt/text/corpus'
4
+ require 'rbbt/text/corpus/sources/pmid'
5
+
6
+ class TestCorpusPMID < Test::Unit::TestCase
7
+ def setup
8
+ Log.severity = 0
9
+ Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
10
+
11
+ Corpus::Document.define :words do
12
+ words = self.text.split(" ")
13
+ Segment.align(self.text, words)
14
+ end
15
+
16
+ Corpus::Document.define :genes do
17
+ require 'rbbt/ner/banner'
18
+ Banner.new.match(self.text)
19
+ end
20
+
21
+ Corpus::Document.persist_in_global_tsv("genes")
22
+ Corpus::Document.persist_in_global_tsv(:words)
23
+ end
24
+
25
+ def test_query
26
+ docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)
27
+
28
+ docids.each do |docid|
29
+ iif Document.corpus.docid(docid).text
30
+ end
31
+ end
32
+ end
33
+