rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
4
- data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
3
+ metadata.gz: d25c6d473e1ee0a8ba79af357571181539b6e18e6b8d11e85fcca037069be3bf
4
+ data.tar.gz: dbc3621f7fbc0ab5569b9f98a527c20cbc4192c6db211504a904364452518caf
5
5
  SHA512:
6
- metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
7
- data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
6
+ metadata.gz: 7a6568d91518fa0c4aedd748fe2b7c2db745a2997efb03c00993ebc24f6682422d209aa266912bcaca32c2033b6babbf9b14db2bf39973b4a33a69fa9ed07eca
7
+ data.tar.gz: 2c87eeccbb22e90c87611024429918e4a3fbdcf8212d6b6538e063d6e0116a457a6c855bafad4ac4621bfb7ae91ff18d2cea0cbcf56dfdff79c2ec88666cbf18
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
# Computes weighted feature values for +text+.
#
# weights - Hash mapping each term to a numeric weight.
#
# The feature values are requested from +features+ for the keys of
# +weights+, so they come back in the same order as the weights; each
# value is then multiplied by the weight at the same position.
#
# Returns an Array with one weighted value per feature value.
def self.weighted_features(text, weights)
  factors = weights.values
  term_values = features(text, weights.keys)
  term_values.each_with_index.map { |value, position| value * factors[position] }
end
72
77
  end
73
78
 
74
79
  class String
@@ -82,5 +87,3 @@ class String
82
87
  BagOfWords.bigrams(self)
83
88
  end
84
89
  end
85
-
86
-
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
173
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
174
178
  }
175
179
  if limit
176
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
177
181
  else
178
182
  best
179
183
  end
@@ -0,0 +1,56 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/entity'
3
+
4
# Entity for document identifiers: colon-separated strings whose leading
# fields are namespace, code and type (see the split calls below). Carries
# a :corpus annotation used to look documents up.
module DocID
  extend Entity
  self.annotation :corpus

  class << self
    # Corpus used when a DocID has no :corpus annotation of its own.
    attr_accessor :default_corpus
  end

  # Returns the annotated corpus, falling back to DocID.default_corpus.
  def corpus
    annotation_values[:corpus] || DocID.default_corpus
  end

  # Builds the DocID for the same namespace and code but the given +type+.
  # The previous code ignored +type+ and always produced a "title" id.
  property :to do |type|
    namespace, code = self.split(":")
    DocID.setup([namespace, code, type].compact * ":", :corpus => corpus)
  end

  # Looks the id (or each id of an array) up in the corpus and returns the
  # result set up as Document.
  property :document => :both do
    if Array === self
      docs = self.collect { |docid| self.corpus[docid] }
      Document.setup(docs, :corpus => corpus)
    else
      text = self.corpus[self]
      namespace, id, type = self.split(":")
      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
    end
  end
end
38
+
39
# Entity for document texts, annotated with namespace, code, type and the
# corpus they belong to.
module Document
  extend Entity
  self.annotation :namespace, :code, :type, :corpus

  # Builds the DocID of this document: namespace, code, type and a digest
  # of the text joined with ":". The optional +corpus+ argument overrides
  # the document's own corpus annotation on the resulting DocID.
  property :docid do |corpus=nil|
    corpus = self.corpus if corpus.nil?
    fields = [namespace, code, type, Misc.digest(self)]

    DocID.setup(fields.join(":"), :corpus => corpus)
  end

  # Resolves the document reached by converting this document's DocID
  # with +type+.
  property :to do |type|
    target_id = docid.to(type)
    target_id.document
  end

  alias_method :id, :docid
end
56
+
@@ -0,0 +1,45 @@
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/annotation'
3
+
4
module Document
  # Defines a property +type+ on documents whose value is the list of
  # segments produced by +block+ (evaluated with instance_exec on the
  # document).
  #
  # The segments are aligned against the document unless the list is empty
  # or they already carry an offset, are set up as SegmentAnnotation with
  # the property name as type when they lack one, and get the document's
  # docid when they have none.
  def self.define(type, &block)
    property(type) do
      segments = instance_exec(&block)

      already_placed = segments.empty? ||
        (Segment === segments && segments.offset) ||
        (Segment === segments.first && segments.first.offset)
      Segment.align(self, segments) unless already_placed

      segments.each do |segment|
        next if SegmentAnnotation === segment && segment.type
        SegmentAnnotation.setup(segment, :type => type.to_s)
      end

      docid = self.docid
      segments.each { |segment| segment.docid = docid if segment.docid.nil? }

      segments
    end
  end

  # Same as +define+ but for a :multiple property: +block+ receives the
  # whole list of documents and returns one segment list per document,
  # either positionally or as a Hash that is reordered with
  # chunked_values_at. Entries that are nil are skipped.
  def self.define_multiple(type, &block)
    property(type => :multiple) do |list|
      doc_segments = instance_exec(list, &block)
      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments

      doc_segments.each_with_index do |segments, position|
        next if segments.nil?

        document = list[position]

        already_placed = segments.empty? ||
          (Segment === segments && segments.offset) ||
          (Segment === segments.first && segments.first.offset)
        Segment.align(document, segments) unless already_placed

        segments.each do |segment|
          next if SegmentAnnotation === segment && segment.type
          SegmentAnnotation.setup(segment, :type => type.to_s)
        end

        docid = document.docid
        segments.each { |segment| segment.docid = docid if segment.docid.nil? }
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'rbbt-util'
2
+
3
# Behaviour mixed into a persistent store so it can act as a corpus of
# documents keyed by docid.
module Document::Corpus

  # Extends +corpus+ with Document::Corpus and Persist::TSVAdapter when it
  # does not already have them, and returns it.
  def self.setup(corpus)
    corpus.extend Document::Corpus unless Document::Corpus === corpus
    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
    corpus
  end

  # Stores +document+ under its docid unless that docid is already present.
  def add_document(document)
    docid = document.docid
    return document if self.include?(docid)
    self.write_and_close do
      self[docid] = document
    end
  end

  # Returns the docids starting with +prefix+ (a trailing ":" is appended
  # when missing), set up as DocID bound to this corpus.
  def docids(prefix)
    prefix += ":" unless prefix[-1] == ":"
    docids = self.read_and_close do
      self.prefix(prefix)
    end
    DocID.setup(docids, :corpus => self)
  end

  # Returns the documents for the docids starting with +prefix+.
  def documents(prefix)
    self.docids(prefix).document
  end

  # Reads an entry. With more than one argument the stored value is
  # returned untouched. With a single docid, a missing entry is requested
  # from the block claimed for the docid's namespace, if any, and a
  # non-nil result is set up as Document before being returned.
  def [](*args)
    docid = args.first

    res = self.read_and_close do
      super(*args)
    end

    return res if args.length > 1

    namespace, id, type = docid.split(":")

    if res.nil?
      claim = Document::Corpus.claims[namespace.to_s]
      res = self.instance_exec(id, type, &claim) if claim
    end

    Document.setup(res, namespace, id, type, self) unless res.nil?

    res
  end

  class << self
    attr_writer :claims

    # Registered claim blocks by namespace; never nil, so lookups work
    # before anything has been claimed.
    def claims
      @claims ||= {}
    end

    # Registers +block+ as the loader for docids in +namespace+. Earlier
    # registrations are kept (the hash used to be reset on every call).
    def claim(namespace, &block)
      claims[namespace.to_s] = block
    end
  end

end
@@ -0,0 +1,33 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
module Document::Corpus
  # Fetches one or several PubMed articles and adds them to the corpus.
  #
  # pmid - a single PMID or an Array of them.
  # type - nil, empty or :abstract for the abstract (empty text when the
  #        article has none), :title for the title; any other value asks
  #        for the full text and raises when it is not available.
  #
  # Returns the collected add_document results, set up as Document.
  # NOTE(review): the Document.setup calls below are kept exactly as they
  # were; their argument lists differ between branches -- confirm intent.
  def add_pmid(pmid, type = nil)
    pmids = Array === pmid ? pmid : [pmid]
    type = nil if String === type && type.empty?

    added = PubMed.get_article(pmids).collect do |article_pmid, article|
      kind = type.nil? ? :abstract : type.to_sym

      document =
        case kind
        when :abstract
          Document.setup(article.abstract || "", "PMID", article_pmid, :abstract, self, :corpus => self)
        when :title
          Document.setup(article.title, :PMID, article_pmid, :title, self)
        else
          raise "No FullText available for #{ article_pmid }" if article.full_text.nil?
          Document.setup(article.full_text, :PMID, article_pmid, :fulltext, self, :corpus => self)
        end

      Log.debug "Loading pmid #{article_pmid}"
      add_document(document)
    end

    Document.setup(added)
  end

  # Runs a PubMed query (at most +max+ results) and adds the matching
  # articles through add_pmid.
  def add_pubmed_query(query, max = 3000, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end

  # Missing "PMID" docids are loaded through add_pmid.
  self.claim "PMID" do |id, type|
    Log.debug "Claiming #{id}"
    self.add_pmid(id, type).first
  end
end
@@ -1,6 +1,6 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/named_entity'
3
- require 'rbbt/text/segment/segmented'
1
+ require 'rbbt/segment'
2
+ require 'rbbt/segment/named_entity'
3
+ require 'rbbt/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/resource'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'rbbt/ner/NER'
4
- require 'rbbt/text/segment'
4
+ require 'rbbt/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
@@ -1,4 +1,4 @@
1
require 'rbbt/segment/named_entity'
# lib/rbbt/text/segment/relationship.rb is removed in this release; the
# relationship code now lives in lib/rbbt/segment/relationship.rb.
require 'rbbt/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/text/segment'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
8
7
  Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
9
8
 
10
9
  def self.init
11
- ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
10
+ ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
12
11
  Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
12
  @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
14
13
  end
@@ -1,4 +1,6 @@
1
1
  require 'rbbt-util'
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
2
4
  module GNormPlus
3
5
 
4
6
  Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -10,35 +12,39 @@ module GNormPlus
10
12
  end
11
13
 
12
14
  CONFIG =<<-EOF
13
-
14
15
  #===Annotation
15
16
  #Attribution setting:
16
17
  #FocusSpecies = Taxonomy ID
17
- # All: All species
18
- # 9606: Human
19
- # 4932: yeast
20
- # 7227: Fly
21
- # 10090: Mouse
22
- # 10116: Rat
23
- # 7955: Zebrafish
24
- # 3702: Arabidopsis thaliana
18
+ # All: All species
19
+ # 9606: Human
20
+ # 4932: yeast
21
+ # 7227: Fly
22
+ # 10090: Mouse
23
+ # 10116: Rat
24
+ # 7955: Zebrafish
25
+ # 3702: Arabidopsis thaliana
25
26
  #open: True
26
27
  #close: False
27
28
 
28
29
  [Focus Species]
29
- FocusSpecies = All
30
+ FocusSpecies = 9606
31
+ FilterAntibody = False
30
32
  [Dictionary & Model]
31
33
  DictionaryFolder = ./Dictionary
32
34
  GNRModel = ./Dictionary/GNR.Model
33
35
  SCModel = ./Dictionary/SimConcept.Model
34
36
  GeneIDMatch = True
37
+ HomologeneID = False
35
38
  Normalization2Protein = False
39
+ ShowUnNormalizedMention = False
40
+ IgnoreNER = False
36
41
  DeleteTmp = True
37
42
  EOF
38
43
 
39
44
  def self.process(texts)
40
45
  TmpFile.with_file do |tmpdir|
41
46
  Open.mkdir tmpdir
47
+
42
48
  Misc.in_dir tmpdir do
43
49
  Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
44
50
  Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
@@ -49,13 +55,18 @@ EOF
49
55
  Open.mkdir 'tmp'
50
56
 
51
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
52
62
  Open.write("input/#{name}.txt") do |f|
53
63
  f.puts "#{name}|a|" << text
54
64
  f.puts
55
65
  end
56
66
  end
67
+
57
68
  Open.write('config', CONFIG)
58
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
59
70
 
60
71
  if texts.respond_to? :key_field
61
72
  key_field = texts.key_field
@@ -65,13 +76,32 @@ EOF
65
76
  tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
66
77
  Dir.glob("output/*.txt").each do |file|
67
78
  name = File.basename(file).sub(".txt",'')
68
- entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
79
+ entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
69
80
  tsv[name] = entities
70
81
  end
82
+
83
+ raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
84
+
71
85
  tsv
72
86
  end
73
87
  end
74
88
  end
89
+
90
# Runs +process+ over +texts+ and turns every reported entity into a
# NamedEntity.
#
# Each entity string is split on ":" into start, end, literal, type and
# code; the "·" placeholders in the literal are turned back into ":".
# The end position is not used.
#
# Returns a Hash of text name => Array of NamedEntity.
def self.entities(texts)
  by_name = {}

  process(texts).each do |name, entities|
    by_name[name] = entities.collect do |entity|
      start, _eend, literal, type, code = entity.split(":")
      literal.gsub!('·',':')

      NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
    end
  end

  by_name
end
75
105
  end
76
106
 
77
107
  if __FILE__ == $0