rbbt-text 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +3 -3
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +2 -2
- data/lib/rbbt/ner/chemical_tagger.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +1 -1
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +1 -1
- data/lib/rbbt/ner/patterns.rb +4 -4
- data/lib/rbbt/ner/regexpNER.rb +1 -1
- data/lib/rbbt/ner/token_trieNER.rb +2 -2
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +2 -2
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
- data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
- data/lib/rbbt/text/corpus/document.rb +361 -0
- data/lib/rbbt/text/corpus/document_repo.rb +68 -0
- data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
- data/lib/rbbt/text/document.rb +39 -0
- data/lib/rbbt/{ner → text}/segment.rb +11 -6
- data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
- data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
- data/test/rbbt/entity/test_document.rb +1 -0
- data/test/rbbt/ner/test_abner.rb +1 -0
- data/test/rbbt/ner/test_linnaeus.rb +1 -0
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
- data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
- data/test/rbbt/text/corpus/test_document.rb +52 -0
- data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
- data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
- data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
- data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
- data/test/rbbt/text/test_corpus.rb +34 -0
- data/test/rbbt/text/test_document.rb +58 -0
- data/test/rbbt/{ner → text}/test_segment.rb +2 -2
- data/test/test_helper.rb +3 -3
- metadata +32 -24
- data/lib/rbbt/corpus/document.rb +0 -266
- data/lib/rbbt/corpus/document_repo.rb +0 -137
- data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
- data/lib/rbbt/entity/document.rb +0 -75
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
require 'tokyocabinet'
|
3
|
+
|
4
|
+
class Corpus
  # Mixin that adds document-id helpers to a key/value store.
  #
  # Keys are built as "namespace:id:type:hash" (see fields2docid). The host
  # object must supply `get`, `[]=`, `include?`, `keys`, `range` and
  # `write_and_close`; none of them is defined here.
  module DocumentRepo
    class OpenError < StandardError; end
    class KeyFormatError < StandardError; end

    TC_CONNECTIONS = {}

    # Opens the database at +path+ through Persist.open_tokyocabinet and
    # extends the returned object with this module before handing it back.
    def self.open_tokyocabinet(path, write)
      Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB).tap do |store|
        store.extend DocumentRepo
      end
    end

    # Splits a docid on ":" and returns its first four fields. Trailing empty
    # fields are kept as "" (split limit -1); absent fields come back as nil.
    def docid2fields(docid)
      pieces = docid.split(":", -1)
      (0..3).map { |position| pieces[position] }
    end

    # Joins the four fields with ":"; nil fields become empty strings.
    def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
      [namespace, id, type, hash].join(":")
    end

    # Returns whatever the store's `get` yields for +docid+.
    def docid(docid)
      get(docid)
    end

    # Stores +text+ under the docid built from the given fields, unless that
    # key is already present. Returns the docid in both cases.
    def add(text, namespace, id, type, hash)
      key = fields2docid(namespace, id, type, hash)

      write_and_close { self[key] = text } unless self.include?(key)

      key
    end

    # Lists docids matching the given fields.
    #
    # - no namespace: every key in the store
    # - id plus hash (with or without type): the single exact docid, built
    #   without consulting the store
    # - otherwise: the keys returned by `range` for the "prefix:" .. "prefix;"
    #   interval (";" is the character right after ":"), where prefix is the
    #   join of the fields that were given.
    #   NOTE(review): the true/false arguments are presumably the
    #   inclusive-start / exclusive-end flags of the store's range -- confirm.
    def find(namespace=nil, id = nil, type = nil, hash = nil)
      return self.keys if namespace.nil?

      given =
        if id.nil?
          [namespace]
        elsif type && hash
          return [[namespace, id, type, hash].join(":")]
        elsif hash
          return [[namespace, id, "", hash].join(":")]
        elsif type
          [namespace, id, type]
        else
          [namespace, id]
        end

      prefix = given.join(":")
      self.range(prefix + ':', true, prefix + ';', false)
    end

    # Runs find with the fields extracted from +docid+.
    def find_docid(docid)
      find(*docid2fields(docid))
    end

  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rbbt/sources/pubmed'
|
2
|
+
|
3
|
+
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Fetches the given PubMed article(s) and registers one document per
  # article through add_document, under the :PMID namespace.
  #
  # pmid - a single id or an Array of ids.
  # type - nil, an empty String or :abstract store the abstract ("" when the
  #        article has none); :title stores the title; any other value stores
  #        the full text under the :fulltext type and raises if the article
  #        has no full text.
  #
  # Returns the collected results of the add_document calls.
  def add_pmid(pmid, type = nil)
    requested = Array === pmid ? pmid : [pmid]
    type = nil if type.is_a?(String) && type.empty?

    PubMed.get_article(requested).collect do |article_id, article|
      Log.debug "Loading pmid #{article_id}"

      kind = type.nil? ? :abstract : type.to_sym
      case kind
      when :abstract
        add_document(article.abstract || "", :PMID, article_id, :abstract)
      when :title
        add_document(article.title, :PMID, article_id, :title)
      else
        raise "No FullText available for #{ article_id }" if article.full_text.nil?
        add_document(article.full_text, :PMID, article_id, :fulltext)
      end
    end
  end

  # Runs +query+ against PubMed (at most +max+ ids) and passes the resulting
  # ids, together with +type+, to add_pmid.
  def add_pubmed_query(query, max = 3000, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end

  # Registers a handler for the "PMID" namespace that delegates to add_pmid.
  # NOTE(review): `claim` is defined outside this file; which object `self`
  # is inside the block depends on how claim evaluates it -- confirm.
  self.claim "PMID" do |id, type|
    Log.debug "Claiming #{id}"
    self.add_pmid(id, type)
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity'
|
3
|
+
|
4
|
+
require 'rbbt/text/corpus'
|
5
|
+
|
6
|
+
# Entity module for document ids. Properties resolve the id against the
# corpus assigned to Document.corpus.
module Document
  extend Entity

  class << self
    attr_accessor :corpus
  end

  # The object returned by the corpus for this docid.
  property :document => :single do
    Document.corpus.docid(self)
  end

  # A new annotated docid that keeps the first two ":"-separated fields of
  # this one and uses +type+ as the third field.
  property :type => :single do |type|
    leading = self.split(":").values_at(0, 1)
    self.annotate(leading.join(":") + ":" + type.to_s)
  end

  # Text of the :title variant of this document.
  property :title => :single do
    type(:title).text
  end

  # Text of the :full_text variant of this document.
  # NOTE(review): the PMID source stores full texts under the type :fulltext
  # (no underscore) while this looks up :full_text -- confirm the two agree.
  property :full_text => :single do
    type(:full_text).text
  end

  # Text of the :abstract variant of this document.
  property :abstract => :single do
    type(:abstract).text
  end

  # Text of the corpus document behind this docid.
  property :text => :single do
    document.text
  end

  # Calls the method named +type+ on the corpus document, forwarding +args+.
  property :entities => :single do |type,*args|
    document.method(type).call(*args)
  end
end
|
@@ -77,9 +77,14 @@ module Segment
|
|
77
77
|
self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
|
78
78
|
end
|
79
79
|
|
80
|
+
# Returns the members of +segments+ for which overlaps? holds against this
# segment.
def overlaps(segments)
  segments.select do |candidate|
    overlaps?(candidate)
  end
end
|
83
|
+
|
84
|
+
|
80
85
|
# Returns the segments of +secondary+ that overlap at least one segment of
# +main+, in their original order.
#
# main      - collection of objects responding to overlaps?(segment).
# secondary - collection of segments to filter.
def self.collisions(main, secondary)
  secondary.select do |ss|
    # any? stops at the first overlapping main segment instead of building
    # the full list of matches just to test whether it is empty.
    main.any? { |ms| ms.overlaps?(ss) }
  end
end
|
85
90
|
|
@@ -320,7 +325,7 @@ module Segment
|
|
320
325
|
tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
321
326
|
|
322
327
|
segments.each do |segment|
|
323
|
-
tsv[segment.
|
328
|
+
tsv[segment.object_id.to_s] = self.tsv_values_for_segment(segment, fields)
|
324
329
|
end
|
325
330
|
|
326
331
|
tsv
|
@@ -343,8 +348,8 @@ module Segment
|
|
343
348
|
[offset, self.end] * ".."
|
344
349
|
end
|
345
350
|
|
346
|
-
def ==(other)
|
347
|
-
|
348
|
-
end
|
351
|
+
#def ==(other)
|
352
|
+
# self.text == other.text
|
353
|
+
#end
|
349
354
|
end
|
350
355
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/text/segment'
|
2
2
|
require 'rbbt/entity'
|
3
3
|
|
4
4
|
module NamedEntity
|
@@ -32,7 +32,7 @@ Score: #{score.inspect}
|
|
32
32
|
format, entity = code.split(":")
|
33
33
|
entity, format = format, nil if entity.nil?
|
34
34
|
|
35
|
-
if defined?
|
35
|
+
if defined?(Entity) && Entity.formats.include?(type) or Entity.formats.include?(format)
|
36
36
|
params ||= {}
|
37
37
|
params[:format] = format if format and params[:format].nil?
|
38
38
|
mod = (Entity.formats[type] || Entity.format[entity])
|
@@ -1,10 +1,12 @@
|
|
1
1
|
require 'rbbt/util/misc'
|
2
|
-
require 'rbbt/
|
2
|
+
require 'rbbt/text/segment'
|
3
3
|
|
4
4
|
module Transformed
|
5
5
|
|
6
6
|
def self.transform(text, segments, replacement = nil, &block)
|
7
7
|
|
8
|
+
block = replacement if Proc === replacement
|
9
|
+
|
8
10
|
text.extend Transformed
|
9
11
|
text.replace_segments(segments, replacement, &block)
|
10
12
|
|
@@ -24,68 +26,44 @@ module Transformed
|
|
24
26
|
end
|
25
27
|
|
26
28
|
attr_accessor :transformed_segments, :transformation_stack
|
27
|
-
|
29
|
+
|
28
30
|
# Computes how far the begin and the end of the range +segment_o+ have to
# move given the replacements recorded in @transformed_segments.
#
# Each recorded entry is destructured as [pseg_o, diff, utext, pseg_u, index];
# entries are visited in order of their last element. Only +diff+ and
# +pseg_u+ (moved by this object's offset, when it responds to offset) are
# used here.
#
# Returns [begin_shift, end_shift], or nil as soon as +segment_o+ falls
# inside, or partially overlaps, one of the recorded ranges.
def shift(segment_o)
  start_delta = 0
  finish_delta = 0

  base = self.respond_to?(:offset) ? self.offset.to_i : 0

  ordered = @transformed_segments.sort_by { |_id, info| info.last }
  ordered.each do |_id, info|
    _pseg_o, diff, _utext, pseg_u, _index = info

    replaced = ((pseg_u.begin + base)..(pseg_u.last + base))

    # Before: the recorded range starts after the segment ends; no shift.
    last = segment_o.last + finish_delta
    next if last < replaced.begin

    first = segment_o.begin + start_delta

    if first > replaced.last
      # After: both ends move by the size change of the replacement.
      start_delta += diff
      finish_delta += diff
    elsif first <= replaced.begin && last >= replaced.last
      # Includes: only the end moves.
      finish_delta += diff
    elsif first >= replaced.begin && last <= replaced.last
      # Inside
      return nil
    elsif first <= replaced.begin && last <= replaced.last
      # Overlaps start
      return nil
    elsif first >= replaced.begin && last >= replaced.last
      # Overlaps end
      return nil
    else
      raise "Unknown overlaps: #{segment_o.inspect} - #{replaced.inspect}"
    end
  end

  [start_delta, finish_delta]
end
|
61
66
|
|
62
|
-
#def self.sort(segments)
|
63
|
-
# segments.compact.sort do |a,b|
|
64
|
-
# case
|
65
|
-
# when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
|
66
|
-
# 0
|
67
|
-
# when (a.nil? || a.offset.nil?)
|
68
|
-
# -1
|
69
|
-
# when (b.nil? || b.offset.nil?)
|
70
|
-
# +1
|
71
|
-
# # Non-overlap
|
72
|
-
# when (a.end < b.offset.to_i || b.end < a.offset.to_i)
|
73
|
-
# b.offset <=> a.offset
|
74
|
-
# # b includes a
|
75
|
-
# when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
|
76
|
-
# -1
|
77
|
-
# # b includes a
|
78
|
-
# when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
|
79
|
-
# +1
|
80
|
-
# # Overlap
|
81
|
-
# when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
|
82
|
-
# b.length <=> a.length
|
83
|
-
# else
|
84
|
-
# raise "Unexpected case in sort: #{a.range} - #{b.range}"
|
85
|
-
# end
|
86
|
-
# end
|
87
|
-
#end
|
88
|
-
|
89
67
|
def replace_segments(segments, replacement = nil, &block)
|
90
68
|
@transformed_segments ||= {}
|
91
69
|
@transformation_stack ||= []
|
@@ -93,8 +71,9 @@ module Transformed
|
|
93
71
|
|
94
72
|
segments = [segments] unless Array === segments
|
95
73
|
orig_length = self.length
|
96
|
-
Segment.
|
74
|
+
Segment.clean_sort(segments).each do |segment|
|
97
75
|
next if segment.offset.nil?
|
76
|
+
|
98
77
|
shift = shift segment.range
|
99
78
|
|
100
79
|
next if shift.nil?
|
@@ -102,6 +81,7 @@ module Transformed
|
|
102
81
|
shift_begin, shift_end = shift
|
103
82
|
|
104
83
|
text_offset = self.respond_to?(:offset)? self.offset.to_i : 0
|
84
|
+
|
105
85
|
updated_begin = segment.offset.to_i + shift_begin - text_offset
|
106
86
|
updated_end = segment.range.last + shift_end - text_offset
|
107
87
|
|
@@ -113,6 +93,8 @@ module Transformed
|
|
113
93
|
next
|
114
94
|
end
|
115
95
|
|
96
|
+
#raise "error '#{segment}' => '#{updated_text}'" if updated_text != segment
|
97
|
+
|
116
98
|
original_text = segment.dup
|
117
99
|
segment.replace updated_text
|
118
100
|
|
@@ -177,8 +159,31 @@ module Transformed
|
|
177
159
|
end
|
178
160
|
end
|
179
161
|
|
180
|
-
def self.
|
181
|
-
|
162
|
+
#def self.sort(segments)
|
163
|
+
# segments.compact.sort do |a,b|
|
164
|
+
# case
|
165
|
+
# when ((a.nil? && b.nil?) || (a.offset.nil? && b.offset.nil?))
|
166
|
+
# 0
|
167
|
+
# when (a.nil? || a.offset.nil?)
|
168
|
+
# -1
|
169
|
+
# when (b.nil? || b.offset.nil?)
|
170
|
+
# +1
|
171
|
+
# # Non-overlap
|
172
|
+
# when (a.end < b.offset.to_i || b.end < a.offset.to_i)
|
173
|
+
# b.offset <=> a.offset
|
174
|
+
# # b includes a
|
175
|
+
# when (a.offset.to_i >= b.offset.to_i && a.end <= b.end)
|
176
|
+
# -1
|
177
|
+
# # b includes a
|
178
|
+
# when (b.offset.to_i >= a.offset.to_i && b.end <= a.end)
|
179
|
+
# +1
|
180
|
+
# # Overlap
|
181
|
+
# when (a.offset.to_i > b.offset.to_i && a.end > b.end || b.offset.to_i > a.offset.to_i && b.end > a.end)
|
182
|
+
# b.length <=> a.length
|
183
|
+
# else
|
184
|
+
# raise "Unexpected case in sort: #{a.range} - #{b.range}"
|
185
|
+
# end
|
186
|
+
# end
|
187
|
+
#end
|
182
188
|
|
183
|
-
end
|
184
189
|
end
|
data/test/rbbt/ner/test_abner.rb
CHANGED
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/text/document'
|
3
|
+
require 'rbbt/text/corpus'
|
4
|
+
require 'rbbt/text/corpus/sources/pmid'
|
5
|
+
|
6
|
+
# Exercises the PubMed source of Corpus: documents added from a query must be
# retrievable through Document.corpus.
# NOTE(review): test_query calls add_pubmed_query and so presumably needs
# network access to PubMed -- confirm before running in CI.
class TestCorpusPMID < Test::Unit::TestCase
  def setup
    Log.severity = 0
    Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus

    # Annotation that splits the document text on spaces and aligns the words.
    Corpus::Document.define :words do
      words = self.text.split(" ")
      Segment.align(self.text, words)
    end

    # Annotation that runs the Banner tagger over the document text; the
    # require is kept inside the block so it only loads when the annotation runs.
    Corpus::Document.define :genes do
      require 'rbbt/ner/banner'
      Banner.new.match(self.text)
    end

    Corpus::Document.persist_in_global_tsv("genes")
    Corpus::Document.persist_in_global_tsv(:words)
  end

  def test_query
    docids = Document.corpus.add_pubmed_query("SARS-Cov-2", 2000, :abstract)

    # The original only dumped each text through the `iif` debug helper and
    # asserted nothing, so the test could not fail on bad results.
    assert !docids.empty?

    docids.each do |docid|
      assert_not_nil Document.corpus.docid(docid).text
    end
  end
end
|
33
|
+
|