rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
- data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
3
+ metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
4
+ data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
5
5
  SHA512:
6
- metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
- data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
6
+ metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
7
+ data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
@@ -0,0 +1,46 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+ require 'rbbt/document/annotation'
4
+
5
+ module DocID
6
+ extend Entity
7
+ self.annotation :corpus
8
+
9
+ class << self
10
+ attr_accessor :default_corpus
11
+ end
12
+
13
+ def corpus
14
+ annotation_values[:corpus] || DocID.default_corpus
15
+ end
16
+
17
+ property :to do |type|
18
+ namespace, code = self.split(":")
19
+ DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
20
+ end
21
+
22
+ def document
23
+ text = self.corpus[self]
24
+ namespace, id, type = self.split(":")
25
+ Document.setup(text, namespace, id, type, :corpus => corpus)
26
+ end
27
+ end
28
+
29
+ module Document
30
+ extend Entity
31
+ self.annotation :namespace, :code, :type, :corpus
32
+
33
+ property :docid do |corpus=nil|
34
+ digest = Misc.digest(self)
35
+ corpus = self.corpus if corpus.nil?
36
+
37
+ DocID.setup([namespace, code, type, digest] * ":", :corpus => corpus)
38
+ end
39
+
40
+ property :to do |type|
41
+ docid.to(type).document
42
+ end
43
+
44
+ alias id docid
45
+ end
46
+
@@ -0,0 +1,42 @@
1
+ require 'rbbt/segment/annotation'
2
+
3
+ module Document
4
+ def self.define(type, &block)
5
+ send :property, type do
6
+ segments = self.instance_exec &block
7
+
8
+ Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+
10
+ segments.each do |segment|
11
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
12
+ end
13
+
14
+ docid = self.docid
15
+ segments.each{|s| s.docid = docid if s.docid.nil? }
16
+
17
+ segments
18
+ end
19
+ end
20
+
21
+ def self.define_multiple(type, &block)
22
+ send :property, type => :multiple do |list|
23
+ doc_segments = self.instance_exec list, &block
24
+
25
+ doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
26
+
27
+ doc_segments.each_with_index do |segments,i|
28
+ document = list[i]
29
+ Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
30
+
31
+ segments.each do |segment|
32
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
33
+ end
34
+
35
+ docid = document.docid
36
+ segments.each{|s| s.docid = docid if s.docid.nil? }
37
+
38
+ segments
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,38 @@
1
+ require 'rbbt-util'
2
+
3
+ module Document::Corpus
4
+
5
+ def self.setup(corpus)
6
+ corpus.extend Document::Corpus
7
+ end
8
+
9
+ def add_document(document)
10
+ self[document.docid] = document
11
+ end
12
+
13
+ def [](*args)
14
+ docid, *rest = args
15
+ res = super(*args)
16
+ return res if args.length > 1
17
+ namespace, id, type = docid.split(":")
18
+
19
+ if res.nil?
20
+ if Document::Corpus.claims.include?(namespace.to_s)
21
+ res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
22
+ end
23
+ end
24
+
25
+ Document.setup(res, namespace, id, type, self) unless res.nil?
26
+
27
+ res
28
+ end
29
+
30
+ class << self
31
+ attr_accessor :claims
32
+ def claim(namespace, &block)
33
+ @claims = {}
34
+ @claims[namespace.to_s] = block
35
+ end
36
+ end
37
+
38
+ end
@@ -0,0 +1,33 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
+ module Document::Corpus
4
+ def add_pmid(pmid, type = nil)
5
+ pmids = Array === pmid ? pmid : [pmid]
6
+ type = nil if String === type and type.empty?
7
+
8
+ res = PubMed.get_article(pmids).collect do |pmid, article|
9
+ Log.debug "Loading pmid #{pmid}"
10
+ document = if type.nil? || type.to_sym == :abstract
11
+ Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
12
+ elsif type.to_sym == :title
13
+ Document.setup(article.title, :PMID, pmid, :title, self)
14
+ else
15
+ raise "No FullText available for #{ pmid }" if article.full_text.nil?
16
+ Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
17
+ end
18
+ add_document(document)
19
+ end
20
+
21
+ Document.setup(res)
22
+ end
23
+
24
+ def add_pubmed_query(query, max = 3000, type = nil)
25
+ pmids = PubMed.query(query, max)
26
+ add_pmid(pmids, type)
27
+ end
28
+
29
+ self.claim "PMID" do |id, type|
30
+ Log.debug "Claiming #{id}"
31
+ self.add_pmid(id, type).first
32
+ end
33
+ end
@@ -1,6 +1,6 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/named_entity'
3
- require 'rbbt/text/segment/segmented'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/named_entity'
3
+ require 'rbbt/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/resource'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/ner/NER'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
@@ -1,4 +1,4 @@
1
- require 'rbbt/text/segment/named_entity'
1
+ require 'rbbt/segment/named_entity'
2
2
  require 'rbbt/text/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
8
7
  Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
8
 
10
9
  def self.init
11
- ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
10
+ ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
12
11
  Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
12
  @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
14
13
  end
@@ -1,4 +1,6 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
2
4
  module GNormPlus
3
5
 
4
6
  Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -35,8 +37,8 @@ module GNormPlus
35
37
  HomologeneID = False
36
38
  Normalization2Protein = False
37
39
  ShowUnNormalizedMention = False
40
+ IgnoreNER = False
38
41
  DeleteTmp = True
39
- IgnoreNER = True
40
42
  EOF
41
43
 
42
44
  def self.process(texts)
@@ -69,7 +71,7 @@ EOF
69
71
  tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
70
72
  Dir.glob("output/*.txt").each do |file|
71
73
  name = File.basename(file).sub(".txt",'')
72
- entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
74
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
73
75
  tsv[name] = entities
74
76
  end
75
77
 
@@ -79,6 +81,21 @@ EOF
79
81
  end
80
82
  end
81
83
  end
84
+
85
+ def self.entities(texts)
86
+ res = {}
87
+ process(texts).each do |name, entities|
88
+
89
+ segments = entities.collect do |entity|
90
+ start, eend, literal, type, code = entity.split(":")
91
+ literal.gsub!('·',':')
92
+
93
+ NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
94
+ end
95
+
96
+ res[name] = segments
97
+ end
98
+ end
82
99
  end
83
100
 
84
101
  if __FILE__ == $0
@@ -1,12 +1,12 @@
1
1
  require 'rjb'
2
2
  require 'rbbt'
3
- require 'rbbt/text/segment/named_entity'
3
+ require 'rbbt/segment/named_entity'
4
4
 
5
5
  module Linnaeus
6
6
 
7
7
  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
8
8
 
9
- ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
9
+ ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
10
10
 
11
11
 
12
12
  Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
@@ -31,7 +31,7 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
34
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
35
35
  end
36
36
  end
37
37
  end
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/tsv'
4
- require 'rbbt/text/segment'
5
- require 'rbbt/text/segment/token'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/token'
6
6
  require 'rbbt/ner/NER'
7
7
  require 'inline'
8
8
 
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
150
150
 
151
151
  def match(text)
152
152
  matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
153
- NamedEntity.setup(name, offset, type, code)
153
+ NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
154
154
  }
155
155
 
156
156
  if case_insensitive
@@ -1,7 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/text/segment'
5
4
  require 'rbbt/ner/NER'
6
5
  require 'rbbt/util/log'
7
6
 
@@ -53,7 +52,7 @@ class OSCAR3 < NER
53
52
  next unless type.nil? or type.include? mention_type
54
53
  score = memm ? entities.get(key).to_string.to_f : nil
55
54
 
56
- NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
55
+ NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
57
56
 
58
57
  mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
59
58
  end
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
@@ -25,7 +25,7 @@ class OSCAR4 < NER
25
25
  @@tagger ||= @@OSCAR.new()
26
26
  end
27
27
 
28
- def self.match(text, type = nil)
28
+ def self.match(text, protect = false, type = nil)
29
29
  self.init
30
30
 
31
31
  return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER
46
46
 
47
47
  next unless entity.getType.toString == type unless type.nil?
48
48
 
49
- NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
49
+ NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
50
50
 
51
51
  result << mention
52
52
  end
@@ -1,7 +1,7 @@
1
- require 'rbbt/text/segment/named_entity'
2
- require 'rbbt/text/segment/segmented'
3
- require 'rbbt/text/segment/transformed'
4
- require 'rbbt/text/segment/relationship'
1
+ require 'rbbt/segment/named_entity'
2
+ require 'rbbt/segment/segmented'
3
+ require 'rbbt/segment/transformed'
4
+ #require 'rbbt/segment/relationship'
5
5
  require 'rbbt/ner/regexpNER'
6
6
  require 'rbbt/ner/token_trieNER'
7
7
  require 'rbbt/nlp/nlp'
@@ -14,7 +14,8 @@ class PatternRelExt
14
14
  regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
15
15
  segments = sentence.segments
16
16
  segments = segments.values.flatten if Hash === segments
17
- Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
17
+ Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
18
+ ppp sentence
18
19
  regexpNER.entities(sentence)
19
20
  end
20
21
  end
@@ -1,4 +1,3 @@
1
- require 'rbbt/text/segment'
2
1
  require 'rbbt/ner/NER'
3
2
  require 'rbbt/util/simpleDSL'
4
3
 
@@ -23,7 +22,7 @@ class RegExpNER < NER
23
22
  end
24
23
 
25
24
  if match and not match.empty?
26
- NamedEntity.setup(match, start + pre.length, type)
25
+ NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
27
26
  matches << match
28
27
  end
29
28
 
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/tsv'
3
- require 'rbbt/text/segment'
4
- require 'rbbt/text/segment/token'
3
+ require 'rbbt/segment'
5
4
  require 'rbbt/ner/NER'
5
+ require 'rbbt/segment/token'
6
6
 
7
7
  class TokenTrieNER < NER
8
8
  def self.clean(token)
@@ -16,13 +16,13 @@ class TokenTrieNER < NER
16
16
  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
17
17
  if no_clean
18
18
  if extend_to_token
19
- Token.setup(clean(token), start, token)
19
+ Token.setup(token, :offset => start, :original => token)
20
20
  else
21
21
  token
22
22
  end
23
23
  else
24
24
  if extend_to_token
25
- Token.setup(clean(token), start, token)
25
+ Token.setup(clean(token), :offset => start, :original => token)
26
26
  else
27
27
  clean(token)
28
28
  end
@@ -137,7 +137,7 @@ class TokenTrieNER < NER
137
137
  tmp_index = {}
138
138
  hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
139
139
  names = Array === names ? names : [names]
140
- names.flatten! if Array === names.first and not Token === names.first.first
140
+ names.flatten! if Array === names.first and not Segment === names.first.first
141
141
 
142
142
  if names.empty?
143
143
  names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -237,7 +237,7 @@ class TokenTrieNER < NER
237
237
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
238
238
  }
239
239
 
240
- NamedEntity.setup(match, match_tokens.first.offset, type, codes)
240
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
241
241
  end
242
242
 
243
243
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean