rbbt-text 1.2.0 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
- data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
3
+ metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
4
+ data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
5
5
  SHA512:
6
- metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
- data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
6
+ metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
7
+ data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
data/lib/rbbt/bow/bow.rb CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
+ def self.weighted_features(text, weights)
74
+ features = features(text, weights.keys)
75
+ features.zip(weights.values).collect{|f,w| f * w }
76
+ end
72
77
  end
73
78
 
74
79
  class String
@@ -82,5 +87,3 @@ class String
82
87
  BagOfWords.bigrams(self)
83
88
  end
84
89
  end
85
-
86
-
data/lib/rbbt/bow/dictionary.rb CHANGED
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
173
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
174
178
  }
175
179
  if limit
176
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
177
181
  else
178
182
  best
179
183
  end
data/lib/rbbt/document.rb ADDED
@@ -0,0 +1,55 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
+ module DocID
5
+ extend Entity
6
+ self.annotation :corpus
7
+
8
+ class << self
9
+ attr_accessor :default_corpus
10
+ end
11
+
12
+ def corpus
13
+ annotation_values[:corpus] || DocID.default_corpus
14
+ end
15
+
16
+ property :to do |type|
17
+ namespace, code = self.split(":")
18
+ DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
19
+ end
20
+
21
+ property :document => :both do
22
+ if Array === self
23
+ namespace, id, type = nil, nil, nil
24
+ docs = self.collect do |docid|
25
+ text = self.corpus[docid]
26
+ namespace, id, type = docid.split(":")
27
+ text
28
+ end
29
+ Document.setup(docs, :corpus => corpus)
30
+ else
31
+ text = self.corpus[self]
32
+ namespace, id, type = self.split(":")
33
+ Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
34
+ end
35
+ end
36
+ end
37
+
38
+ module Document
39
+ extend Entity
40
+ self.annotation :namespace, :code, :type, :corpus
41
+
42
+ property :docid do |corpus=nil|
43
+ digest = Misc.digest(self)
44
+ corpus = self.corpus if corpus.nil?
45
+
46
+ DocID.setup([namespace, code, type, digest] * ":", :corpus => corpus)
47
+ end
48
+
49
+ property :to do |type|
50
+ docid.to(type).document
51
+ end
52
+
53
+ alias id docid
54
+ end
55
+
data/lib/rbbt/document/annotation.rb ADDED
@@ -0,0 +1,45 @@
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
+
4
+ module Document
5
+ def self.define(type, &block)
6
+ send :property, type do
7
+ segments = self.instance_exec &block
8
+
9
+ Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
10
+
11
+ segments.each do |segment|
12
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
13
+ end
14
+
15
+ docid = self.docid
16
+ segments.each{|s| s.docid = docid if s.docid.nil? }
17
+
18
+ segments
19
+ end
20
+ end
21
+
22
+ def self.define_multiple(type, &block)
23
+ send :property, type => :multiple do |list|
24
+ doc_segments = self.instance_exec list, &block
25
+
26
+ doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
27
+
28
+ doc_segments.each_with_index do |segments,i|
29
+ next if segments.nil?
30
+ document = list[i]
31
+ Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
32
+
33
+ segments.each do |segment|
34
+ SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
35
+ end
36
+
37
+ docid = document.docid
38
+
39
+ segments.each{|s| s.docid = docid if s.docid.nil? }
40
+
41
+ segments
42
+ end
43
+ end
44
+ end
45
+ end
data/lib/rbbt/document/corpus.rb ADDED
@@ -0,0 +1,63 @@
1
+ require 'rbbt-util'
2
+
3
+ module Document::Corpus
4
+
5
+ def self.setup(corpus)
6
+ corpus.extend Document::Corpus unless Document::Corpus === corpus
7
+ corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
8
+ corpus
9
+ end
10
+
11
+ def add_document(document)
12
+ docid = document.docid
13
+ return self[docid] if self.include?(docid)
14
+ self.write_and_close do
15
+ self[docid] = document
16
+ end
17
+ end
18
+
19
+ def docids(prefix)
20
+ prefix += ":" unless prefix == :all || prefix[-1] == ":"
21
+ docids = self.read_and_close do
22
+ prefix == :all ? self.keys : self.prefix(prefix)
23
+ end
24
+ DocID.setup(docids, :corpus => self)
25
+ end
26
+
27
+ def documents(prefix)
28
+ self.docids(prefix).document
29
+ end
30
+
31
+ def [](*args)
32
+ docid, *rest = args
33
+
34
+ res = self.read_and_close do
35
+ super(*args)
36
+ end
37
+
38
+ res.force_encoding(Encoding.default_external) if res
39
+ return res if args.length > 1
40
+
41
+ namespace, id, type = docid.split(":")
42
+
43
+ if res.nil?
44
+ if Document::Corpus.claims.include?(namespace.to_s)
45
+ res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
46
+ end
47
+ end
48
+
49
+ res.force_encoding(Encoding.default_external) if res
50
+ Document.setup(res, namespace, id, type, self) unless res.nil?
51
+
52
+ res
53
+ end
54
+
55
+ class << self
56
+ attr_accessor :claims
57
+ def claim(namespace, &block)
58
+ @claims = {}
59
+ @claims[namespace.to_s] = block
60
+ end
61
+ end
62
+
63
+ end
data/lib/rbbt/document/corpus/pubmed.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
+ module Document::Corpus
4
+ def add_pmid(pmid, type = nil)
5
+ pmids = Array === pmid ? pmid : [pmid]
6
+ type = nil if String === type and type.empty?
7
+
8
+ res = PubMed.get_article(pmids).collect do |pmid, article|
9
+ document = if type.nil? || type.to_sym == :abstract
10
+ Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
11
+ elsif type.to_sym == :title
12
+ Document.setup(article.title, :PMID, pmid, :title, self)
13
+ else
14
+ raise "No FullText available for #{ pmid }" if article.full_text.nil?
15
+ Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
16
+ end
17
+ Log.debug "Loading pmid #{pmid}"
18
+ add_document(document)
19
+ end
20
+
21
+ Document.setup(res)
22
+ end
23
+
24
+ def add_pubmed_query(query, max = 3000, type = nil)
25
+ pmids = PubMed.query(query, max)
26
+ add_pmid(pmids, type)
27
+ end
28
+
29
+ self.claim "PMID" do |id, type|
30
+ Log.debug "Claiming #{id}"
31
+ self.add_pmid(id, type).first
32
+ end
33
+ end
data/lib/rbbt/ner/NER.rb CHANGED
@@ -1,6 +1,6 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/named_entity'
3
- require 'rbbt/text/segment/segmented'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/named_entity'
3
+ require 'rbbt/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
data/lib/rbbt/ner/abner.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/resource'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
data/lib/rbbt/ner/banner.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/ner/NER'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'rbbt/text/segment/named_entity'
1
+ require 'rbbt/segment/named_entity'
2
2
  require 'rbbt/text/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
data/lib/rbbt/ner/chemical_tagger.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
8
7
  Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
8
 
10
9
  def self.init
11
- ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
10
+ ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
12
11
  Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
12
  @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
14
13
  end
data/lib/rbbt/ner/g_norm_plus.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
2
4
  module GNormPlus
3
5
 
4
6
  Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -35,8 +37,8 @@ module GNormPlus
35
37
  HomologeneID = False
36
38
  Normalization2Protein = False
37
39
  ShowUnNormalizedMention = False
40
+ IgnoreNER = False
38
41
  DeleteTmp = True
39
- IgnoreNER = True
40
42
  EOF
41
43
 
42
44
  def self.process(texts)
@@ -53,11 +55,16 @@ EOF
53
55
  Open.mkdir 'tmp'
54
56
 
55
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
56
62
  Open.write("input/#{name}.txt") do |f|
57
- f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
63
+ f.puts "#{name}|a|" << text
58
64
  f.puts
59
65
  end
60
66
  end
67
+
61
68
  Open.write('config', CONFIG)
62
69
  CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
63
70
 
@@ -69,7 +76,7 @@ EOF
69
76
  tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
70
77
  Dir.glob("output/*.txt").each do |file|
71
78
  name = File.basename(file).sub(".txt",'')
72
- entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
79
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
73
80
  tsv[name] = entities
74
81
  end
75
82
 
@@ -79,6 +86,22 @@ EOF
79
86
  end
80
87
  end
81
88
  end
89
+
90
+ def self.entities(texts)
91
+ res = {}
92
+ process(texts).each do |name, entities|
93
+
94
+ segments = entities.collect do |entity|
95
+ start, eend, literal, type, code = entity.split(":")
96
+ literal.gsub!('·',':')
97
+
98
+ NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
99
+ end
100
+
101
+ res[name] = segments
102
+ end
103
+ res
104
+ end
82
105
  end
83
106
 
84
107
  if __FILE__ == $0
data/lib/rbbt/ner/linnaeus.rb CHANGED
@@ -1,12 +1,12 @@
1
1
  require 'rjb'
2
2
  require 'rbbt'
3
- require 'rbbt/text/segment/named_entity'
3
+ require 'rbbt/segment/named_entity'
4
4
 
5
5
  module Linnaeus
6
6
 
7
7
  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
8
8
 
9
- ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
9
+ ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
10
10
 
11
11
 
12
12
  Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
@@ -31,7 +31,7 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
34
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
35
35
  end
36
36
  end
37
37
  end
data/lib/rbbt/ner/ngram_prefix_dictionary.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/tsv'
4
- require 'rbbt/text/segment'
5
- require 'rbbt/text/segment/token'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/token'
6
6
  require 'rbbt/ner/NER'
7
7
  require 'inline'
8
8
 
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
150
150
 
151
151
  def match(text)
152
152
  matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
153
- NamedEntity.setup(name, offset, type, code)
153
+ NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
154
154
  }
155
155
 
156
156
  if case_insensitive