rbbt-text 1.2.0 → 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
- data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
3
+ metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
4
+ data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
5
5
  SHA512:
6
- metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
- data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
6
+ metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
7
+ data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
+ def self.weighted_features(text, weights)
74
+ features = features(text, weights.keys)
75
+ features.zip(weights.values).collect{|f,w| f * w }
76
+ end
72
77
  end
73
78
 
74
79
  class String
@@ -82,5 +87,3 @@ class String
82
87
  BagOfWords.bigrams(self)
83
88
  end
84
89
  end
85
-
86
-
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
173
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
174
178
  }
175
179
  if limit
176
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
177
181
  else
178
182
  best
179
183
  end
@@ -0,0 +1,55 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
+ module DocID
5
+ extend Entity
6
+ self.annotation :corpus
7
+
8
+ class << self
9
+ attr_accessor :default_corpus
10
+ end
11
+
12
+ def corpus
13
+ annotation_values[:corpus] || DocID.default_corpus
14
+ end
15
+
16
+ property :to do |type|
17
+ namespace, code = self.split(":")
18
+ DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
19
+ end
20
+
21
+ property :document => :both do
22
+ if Array === self
23
+ namespace, id, type = nil, nil, nil
24
+ docs = self.collect do |docid|
25
+ text = self.corpus[docid]
26
+ namespace, id, type = docid.split(":")
27
+ text
28
+ end
29
+ Document.setup(docs, :corpus => corpus)
30
+ else
31
+ text = self.corpus[self]
32
+ namespace, id, type = self.split(":")
33
+ Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
34
+ end
35
+ end
36
+ end
37
+
38
+ module Document
39
+ extend Entity
40
+ self.annotation :namespace, :code, :type, :corpus
41
+
42
+ property :docid do |corpus=nil|
43
+ digest = Misc.digest(self)
44
+ corpus = self.corpus if corpus.nil?
45
+
46
+ DocID.setup([namespace, code, type, digest] * ":", :corpus => corpus)
47
+ end
48
+
49
+ property :to do |type|
50
+ docid.to(type).document
51
+ end
52
+
53
+ alias id docid
54
+ end
55
+
@@ -0,0 +1,45 @@
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
+
4
+ module Document
5
+ def self.define(type, &block)
6
+ send :property, type do
7
+ segments = self.instance_exec &block
8
+
9
+ Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
10
+
11
+ segments.each do |segment|
12
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
13
+ end
14
+
15
+ docid = self.docid
16
+ segments.each{|s| s.docid = docid if s.docid.nil? }
17
+
18
+ segments
19
+ end
20
+ end
21
+
22
+ def self.define_multiple(type, &block)
23
+ send :property, type => :multiple do |list|
24
+ doc_segments = self.instance_exec list, &block
25
+
26
+ doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
27
+
28
+ doc_segments.each_with_index do |segments,i|
29
+ next if segments.nil?
30
+ document = list[i]
31
+ Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
32
+
33
+ segments.each do |segment|
34
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
35
+ end
36
+
37
+ docid = document.docid
38
+
39
+ segments.each{|s| s.docid = docid if s.docid.nil? }
40
+
41
+ segments
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,63 @@
1
+ require 'rbbt-util'
2
+
3
+ module Document::Corpus
4
+
5
+ def self.setup(corpus)
6
+ corpus.extend Document::Corpus unless Document::Corpus === corpus
7
+ corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
8
+ corpus
9
+ end
10
+
11
+ def add_document(document)
12
+ docid = document.docid
13
+ return self[docid] if self.include?(docid)
14
+ self.write_and_close do
15
+ self[docid] = document
16
+ end
17
+ end
18
+
19
+ def docids(prefix)
20
+ prefix += ":" unless prefix == :all || prefix[-1] == ":"
21
+ docids = self.read_and_close do
22
+ prefix == :all ? self.keys : self.prefix(prefix)
23
+ end
24
+ DocID.setup(docids, :corpus => self)
25
+ end
26
+
27
+ def documents(prefix)
28
+ self.docids(prefix).document
29
+ end
30
+
31
+ def [](*args)
32
+ docid, *rest = args
33
+
34
+ res = self.read_and_close do
35
+ super(*args)
36
+ end
37
+
38
+ res.force_encoding(Encoding.default_external) if res
39
+ return res if args.length > 1
40
+
41
+ namespace, id, type = docid.split(":")
42
+
43
+ if res.nil?
44
+ if Document::Corpus.claims.include?(namespace.to_s)
45
+ res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
46
+ end
47
+ end
48
+
49
+ res.force_encoding(Encoding.default_external) if res
50
+ Document.setup(res, namespace, id, type, self) unless res.nil?
51
+
52
+ res
53
+ end
54
+
55
+ class << self
56
+ attr_accessor :claims
57
+ def claim(namespace, &block)
58
+ @claims = {}
59
+ @claims[namespace.to_s] = block
60
+ end
61
+ end
62
+
63
+ end
@@ -0,0 +1,33 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
+ module Document::Corpus
4
+ def add_pmid(pmid, type = nil)
5
+ pmids = Array === pmid ? pmid : [pmid]
6
+ type = nil if String === type and type.empty?
7
+
8
+ res = PubMed.get_article(pmids).collect do |pmid, article|
9
+ document = if type.nil? || type.to_sym == :abstract
10
+ Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
11
+ elsif type.to_sym == :title
12
+ Document.setup(article.title, :PMID, pmid, :title, self)
13
+ else
14
+ raise "No FullText available for #{ pmid }" if article.full_text.nil?
15
+ Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
16
+ end
17
+ Log.debug "Loading pmid #{pmid}"
18
+ add_document(document)
19
+ end
20
+
21
+ Document.setup(res)
22
+ end
23
+
24
+ def add_pubmed_query(query, max = 3000, type = nil)
25
+ pmids = PubMed.query(query, max)
26
+ add_pmid(pmids, type)
27
+ end
28
+
29
+ self.claim "PMID" do |id, type|
30
+ Log.debug "Claiming #{id}"
31
+ self.add_pmid(id, type).first
32
+ end
33
+ end
@@ -1,6 +1,6 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/named_entity'
3
- require 'rbbt/text/segment/segmented'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/named_entity'
3
+ require 'rbbt/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/resource'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/ner/NER'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
@@ -1,4 +1,4 @@
1
- require 'rbbt/text/segment/named_entity'
1
+ require 'rbbt/segment/named_entity'
2
2
  require 'rbbt/text/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
8
7
  Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
8
 
10
9
  def self.init
11
- ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
10
+ ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
12
11
  Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
12
  @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
14
13
  end
@@ -1,4 +1,6 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
2
4
  module GNormPlus
3
5
 
4
6
  Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -35,8 +37,8 @@ module GNormPlus
35
37
  HomologeneID = False
36
38
  Normalization2Protein = False
37
39
  ShowUnNormalizedMention = False
40
+ IgnoreNER = False
38
41
  DeleteTmp = True
39
- IgnoreNER = True
40
42
  EOF
41
43
 
42
44
  def self.process(texts)
@@ -53,11 +55,16 @@ EOF
53
55
  Open.mkdir 'tmp'
54
56
 
55
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
56
62
  Open.write("input/#{name}.txt") do |f|
57
- f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
63
+ f.puts "#{name}|a|" << text
58
64
  f.puts
59
65
  end
60
66
  end
67
+
61
68
  Open.write('config', CONFIG)
62
69
  CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
63
70
 
@@ -69,7 +76,7 @@ EOF
69
76
  tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
70
77
  Dir.glob("output/*.txt").each do |file|
71
78
  name = File.basename(file).sub(".txt",'')
72
- entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
79
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
73
80
  tsv[name] = entities
74
81
  end
75
82
 
@@ -79,6 +86,22 @@ EOF
79
86
  end
80
87
  end
81
88
  end
89
+
90
+ def self.entities(texts)
91
+ res = {}
92
+ process(texts).each do |name, entities|
93
+
94
+ segments = entities.collect do |entity|
95
+ start, eend, literal, type, code = entity.split(":")
96
+ literal.gsub!('·',':')
97
+
98
+ NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
99
+ end
100
+
101
+ res[name] = segments
102
+ end
103
+ res
104
+ end
82
105
  end
83
106
 
84
107
  if __FILE__ == $0
@@ -1,12 +1,12 @@
1
1
  require 'rjb'
2
2
  require 'rbbt'
3
- require 'rbbt/text/segment/named_entity'
3
+ require 'rbbt/segment/named_entity'
4
4
 
5
5
  module Linnaeus
6
6
 
7
7
  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
8
8
 
9
- ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
9
+ ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
10
10
 
11
11
 
12
12
  Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
@@ -31,7 +31,7 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
34
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
35
35
  end
36
36
  end
37
37
  end
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/tsv'
4
- require 'rbbt/text/segment'
5
- require 'rbbt/text/segment/token'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/token'
6
6
  require 'rbbt/ner/NER'
7
7
  require 'inline'
8
8
 
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
150
150
 
151
151
  def match(text)
152
152
  matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
153
- NamedEntity.setup(name, offset, type, code)
153
+ NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
154
154
  }
155
155
 
156
156
  if case_insensitive