rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
- data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
3
+ metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
4
+ data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
5
5
  SHA512:
6
- metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
- data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
6
+ metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
7
+ data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
@@ -0,0 +1,46 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/document/annotation'
4
+
5
+ module DocID
6
+ extend Entity
7
+ self.annotation :corpus
8
+
9
+ class << self
10
+ attr_accessor :default_corpus
11
+ end
12
+
13
+ def corpus
14
+ annotation_values[:corpus] || DocID.default_corpus
15
+ end
16
+
17
+ property :to do |type|
18
+ namespace, code = self.split(":")
19
+ DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
20
+ end
21
+
22
+ def document
23
+ text = self.corpus[self]
24
+ namespace, id, type = self.split(":")
25
+ Document.setup(text, namespace, id, type, :corpus => corpus)
26
+ end
27
+ end
28
+
29
+ module Document
30
+ extend Entity
31
+ self.annotation :namespace, :code, :type, :corpus
32
+
33
+ property :docid do |corpus=nil|
34
+ digest = Misc.digest(self)
35
+ corpus = self.corpus if corpus.nil?
36
+
37
+ DocID.setup([namespace, code, type, digest] * ":", :corpus => corpus)
38
+ end
39
+
40
+ property :to do |type|
41
+ docid.to(type).document
42
+ end
43
+
44
+ alias id docid
45
+ end
46
+
@@ -0,0 +1,42 @@
1
+ require 'rbbt/segment/annotation'
2
+
3
+ module Document
4
+ def self.define(type, &block)
5
+ send :property, type do
6
+ segments = self.instance_exec &block
7
+
8
+ Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+
10
+ segments.each do |segment|
11
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
12
+ end
13
+
14
+ docid = self.docid
15
+ segments.each{|s| s.docid = docid if s.docid.nil? }
16
+
17
+ segments
18
+ end
19
+ end
20
+
21
+ def self.define_multiple(type, &block)
22
+ send :property, type => :multiple do |list|
23
+ doc_segments = self.instance_exec list, &block
24
+
25
+ doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
26
+
27
+ doc_segments.each_with_index do |segments,i|
28
+ document = list[i]
29
+ Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
30
+
31
+ segments.each do |segment|
32
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
33
+ end
34
+
35
+ docid = document.docid
36
+ segments.each{|s| s.docid = docid if s.docid.nil? }
37
+
38
+ segments
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,38 @@
1
+ require 'rbbt-util'
2
+
3
+ module Document::Corpus
4
+
5
+ def self.setup(corpus)
6
+ corpus.extend Document::Corpus
7
+ end
8
+
9
+ def add_document(document)
10
+ self[document.docid] = document
11
+ end
12
+
13
+ def [](*args)
14
+ docid, *rest = args
15
+ res = super(*args)
16
+ return res if args.length > 1
17
+ namespace, id, type = docid.split(":")
18
+
19
+ if res.nil?
20
+ if Document::Corpus.claims.include?(namespace.to_s)
21
+ res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
22
+ end
23
+ end
24
+
25
+ Document.setup(res, namespace, id, type, self) unless res.nil?
26
+
27
+ res
28
+ end
29
+
30
+ class << self
31
+ attr_accessor :claims
32
+ def claim(namespace, &block)
33
+ @claims = {}
34
+ @claims[namespace.to_s] = block
35
+ end
36
+ end
37
+
38
+ end
@@ -0,0 +1,33 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
+ module Document::Corpus
4
+ def add_pmid(pmid, type = nil)
5
+ pmids = Array === pmid ? pmid : [pmid]
6
+ type = nil if String === type and type.empty?
7
+
8
+ res = PubMed.get_article(pmids).collect do |pmid, article|
9
+ Log.debug "Loading pmid #{pmid}"
10
+ document = if type.nil? || type.to_sym == :abstract
11
+ Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
12
+ elsif type.to_sym == :title
13
+ Document.setup(article.title, :PMID, pmid, :title, self)
14
+ else
15
+ raise "No FullText available for #{ pmid }" if article.full_text.nil?
16
+ Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
17
+ end
18
+ add_document(document)
19
+ end
20
+
21
+ Document.setup(res)
22
+ end
23
+
24
+ def add_pubmed_query(query, max = 3000, type = nil)
25
+ pmids = PubMed.query(query, max)
26
+ add_pmid(pmids, type)
27
+ end
28
+
29
+ self.claim "PMID" do |id, type|
30
+ Log.debug "Claiming #{id}"
31
+ self.add_pmid(id, type).first
32
+ end
33
+ end
@@ -1,6 +1,6 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/named_entity'
3
- require 'rbbt/text/segment/segmented'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/named_entity'
3
+ require 'rbbt/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/resource'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/ner/NER'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
@@ -1,4 +1,4 @@
1
- require 'rbbt/text/segment/named_entity'
1
+ require 'rbbt/segment/named_entity'
2
2
  require 'rbbt/text/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
8
7
  Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
8
 
10
9
  def self.init
11
- ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
10
+ ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
12
11
  Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
12
  @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
14
13
  end
@@ -1,4 +1,6 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
2
4
  module GNormPlus
3
5
 
4
6
  Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -35,8 +37,8 @@ module GNormPlus
35
37
  HomologeneID = False
36
38
  Normalization2Protein = False
37
39
  ShowUnNormalizedMention = False
40
+ IgnoreNER = False
38
41
  DeleteTmp = True
39
- IgnoreNER = True
40
42
  EOF
41
43
 
42
44
  def self.process(texts)
@@ -69,7 +71,7 @@ EOF
69
71
  tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
70
72
  Dir.glob("output/*.txt").each do |file|
71
73
  name = File.basename(file).sub(".txt",'')
72
- entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
74
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
73
75
  tsv[name] = entities
74
76
  end
75
77
 
@@ -79,6 +81,21 @@ EOF
79
81
  end
80
82
  end
81
83
  end
84
+
85
+ def self.entities(texts)
86
+ res = {}
87
+ process(texts).each do |name, entities|
88
+
89
+ segments = entities.collect do |entity|
90
+ start, eend, literal, type, code = entity.split(":")
91
+ literal.gsub!('·',':')
92
+
93
+ NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
94
+ end
95
+
96
+ res[name] = segments
97
+ end
98
+ end
82
99
  end
83
100
 
84
101
  if __FILE__ == $0
@@ -1,12 +1,12 @@
1
1
  require 'rjb'
2
2
  require 'rbbt'
3
- require 'rbbt/text/segment/named_entity'
3
+ require 'rbbt/segment/named_entity'
4
4
 
5
5
  module Linnaeus
6
6
 
7
7
  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
8
8
 
9
- ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
9
+ ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
10
10
 
11
11
 
12
12
  Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
@@ -31,7 +31,7 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
34
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
35
35
  end
36
36
  end
37
37
  end
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/tsv'
4
- require 'rbbt/text/segment'
5
- require 'rbbt/text/segment/token'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/token'
6
6
  require 'rbbt/ner/NER'
7
7
  require 'inline'
8
8
 
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
150
150
 
151
151
  def match(text)
152
152
  matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
153
- NamedEntity.setup(name, offset, type, code)
153
+ NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
154
154
  }
155
155
 
156
156
  if case_insensitive
@@ -1,7 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/text/segment'
5
4
  require 'rbbt/ner/NER'
6
5
  require 'rbbt/util/log'
7
6
 
@@ -53,7 +52,7 @@ class OSCAR3 < NER
53
52
  next unless type.nil? or type.include? mention_type
54
53
  score = memm ? entities.get(key).to_string.to_f : nil
55
54
 
56
- NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
55
+ NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
57
56
 
58
57
  mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
59
58
  end
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
@@ -25,7 +25,7 @@ class OSCAR4 < NER
25
25
  @@tagger ||= @@OSCAR.new()
26
26
  end
27
27
 
28
- def self.match(text, type = nil)
28
+ def self.match(text, protect = false, type = nil)
29
29
  self.init
30
30
 
31
31
  return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER
46
46
 
47
47
  next unless entity.getType.toString == type unless type.nil?
48
48
 
49
- NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
49
+ NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
50
50
 
51
51
  result << mention
52
52
  end
@@ -1,7 +1,7 @@
1
- require 'rbbt/text/segment/named_entity'
2
- require 'rbbt/text/segment/segmented'
3
- require 'rbbt/text/segment/transformed'
4
- require 'rbbt/text/segment/relationship'
1
+ require 'rbbt/segment/named_entity'
2
+ require 'rbbt/segment/segmented'
3
+ require 'rbbt/segment/transformed'
4
+ #require 'rbbt/segment/relationship'
5
5
  require 'rbbt/ner/regexpNER'
6
6
  require 'rbbt/ner/token_trieNER'
7
7
  require 'rbbt/nlp/nlp'
@@ -14,7 +14,8 @@ class PatternRelExt
14
14
  regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
15
15
  segments = sentence.segments
16
16
  segments = segments.values.flatten if Hash === segments
17
- Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
17
+ Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
18
+ ppp sentence
18
19
  regexpNER.entities(sentence)
19
20
  end
20
21
  end
@@ -1,4 +1,3 @@
1
- require 'rbbt/text/segment'
2
1
  require 'rbbt/ner/NER'
3
2
  require 'rbbt/util/simpleDSL'
4
3
 
@@ -23,7 +22,7 @@ class RegExpNER < NER
23
22
  end
24
23
 
25
24
  if match and not match.empty?
26
- NamedEntity.setup(match, start + pre.length, type)
25
+ NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
27
26
  matches << match
28
27
  end
29
28
 
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/tsv'
3
- require 'rbbt/text/segment'
4
- require 'rbbt/text/segment/token'
3
+ require 'rbbt/segment'
5
4
  require 'rbbt/ner/NER'
5
+ require 'rbbt/segment/token'
6
6
 
7
7
  class TokenTrieNER < NER
8
8
  def self.clean(token)
@@ -16,13 +16,13 @@ class TokenTrieNER < NER
16
16
  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
17
17
  if no_clean
18
18
  if extend_to_token
19
- Token.setup(clean(token), start, token)
19
+ Token.setup(token, :offset => start, :original => token)
20
20
  else
21
21
  token
22
22
  end
23
23
  else
24
24
  if extend_to_token
25
- Token.setup(clean(token), start, token)
25
+ Token.setup(clean(token), :offset => start, :original => token)
26
26
  else
27
27
  clean(token)
28
28
  end
@@ -137,7 +137,7 @@ class TokenTrieNER < NER
137
137
  tmp_index = {}
138
138
  hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
139
139
  names = Array === names ? names : [names]
140
- names.flatten! if Array === names.first and not Token === names.first.first
140
+ names.flatten! if Array === names.first and not Segment === names.first.first
141
141
 
142
142
  if names.empty?
143
143
  names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -237,7 +237,7 @@ class TokenTrieNER < NER
237
237
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
238
238
  }
239
239
 
240
- NamedEntity.setup(match, match_tokens.first.offset, type, codes)
240
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
241
241
  end
242
242
 
243
243
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean