rbbt-text 1.3.1 → 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
4
- data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
3
+ metadata.gz: 05b1cf1981e955652598dd3db811cf8e6a7d64b68535e21834012abe90efe388
4
+ data.tar.gz: 67017f8a10cbfae51664999218336d638ea6be7c29b5ec305872473672977a41
5
5
  SHA512:
6
- metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
7
- data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
6
+ metadata.gz: 03b02dcea1040edfa653e976d9f2f808ed25f9e0164add2fc85afa4417cf8e10ff8dfb27e1927c457f0ff6c6ee90311765ac364b7c5d7c8d9fd51cfff4ab9434
7
+ data.tar.gz: a5c44f475241da67863ac33ea446e7dbc64283ca53d6642c7c67c1c6e2e34a5d28b1ad678d2a5a44bf3316ed3c063f2d084628b8ff4d79aee8be04db3f8a6ab1
@@ -87,5 +87,3 @@ class String
87
87
  BagOfWords.bigrams(self)
88
88
  end
89
89
  end
90
-
91
-
@@ -95,7 +95,7 @@ class Dictionary::TF_IDF
95
95
  }
96
96
 
97
97
  if limit
98
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
99
  else
100
100
  Hash[*best.flatten]
101
101
  end
@@ -177,7 +177,7 @@ class Dictionary::KL
177
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
178
178
  }
179
179
  if limit
180
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
181
181
  else
182
182
  best
183
183
  end
@@ -1,6 +1,5 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/entity'
3
- require 'rbbt/document/annotation'
4
3
 
5
4
  module DocID
6
5
  extend Entity
@@ -19,10 +18,21 @@ module DocID
19
18
  DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
20
19
  end
21
20
 
22
- def document
23
- text = self.corpus[self]
24
- namespace, id, type = self.split(":")
25
- Document.setup(text, namespace, id, type, :corpus => corpus)
21
+ property :document => :both do
22
+ if Array === self
23
+ namespace, id, type = nil, nil, nil
24
+ docs = self.collect do |docid|
25
+ text = self.corpus[docid]
26
+ namespace, id, type = docid.split(":")
27
+ #Document.setup(text, namespace, id, type, :corpus => corpus)
28
+ text
29
+ end
30
+ Document.setup(docs, :corpus => corpus)
31
+ else
32
+ text = self.corpus[self]
33
+ namespace, id, type = self.split(":")
34
+ Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
35
+ end
26
36
  end
27
37
  end
28
38
 
@@ -1,3 +1,4 @@
1
+ require 'rbbt/segment'
1
2
  require 'rbbt/segment/annotation'
2
3
 
3
4
  module Document
@@ -22,17 +23,19 @@ module Document
22
23
  send :property, type => :multiple do |list|
23
24
  doc_segments = self.instance_exec list, &block
24
25
 
25
- doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
26
+ doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
26
27
 
27
28
  doc_segments.each_with_index do |segments,i|
29
+ next if segments.nil?
28
30
  document = list[i]
29
- Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
31
+ Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
30
32
 
31
33
  segments.each do |segment|
32
34
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
33
35
  end
34
36
 
35
37
  docid = document.docid
38
+
36
39
  segments.each{|s| s.docid = docid if s.docid.nil? }
37
40
 
38
41
  segments
@@ -3,17 +3,40 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus.extend Document::Corpus
6
+ corpus.extend Document::Corpus unless Document::Corpus === corpus
7
+ corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
8
+ corpus
7
9
  end
8
10
 
9
11
  def add_document(document)
10
- self[document.docid] = document
12
+ docid = document.docid
13
+ return document if self.include?(docid)
14
+ self.write_and_close do
15
+ self[docid] = document
16
+ end
17
+ end
18
+
19
+ def docids(prefix)
20
+ prefix += ":" unless prefix[-1] == ":"
21
+ docids = self.read_and_close do
22
+ self.prefix(prefix)
23
+ end
24
+ DocID.setup(docids, :corpus => self)
25
+ end
26
+
27
+ def documents(prefix)
28
+ self.docids(prefix).document
11
29
  end
12
30
 
13
31
  def [](*args)
14
32
  docid, *rest = args
15
- res = super(*args)
33
+
34
+ res = self.read_and_close do
35
+ super(*args)
36
+ end
37
+
16
38
  return res if args.length > 1
39
+
17
40
  namespace, id, type = docid.split(":")
18
41
 
19
42
  if res.nil?
@@ -6,7 +6,6 @@ module Document::Corpus
6
6
  type = nil if String === type and type.empty?
7
7
 
8
8
  res = PubMed.get_article(pmids).collect do |pmid, article|
9
- Log.debug "Loading pmid #{pmid}"
10
9
  document = if type.nil? || type.to_sym == :abstract
11
10
  Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
12
11
  elsif type.to_sym == :title
@@ -15,6 +14,7 @@ module Document::Corpus
15
14
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
16
15
  Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
17
16
  end
17
+ Log.debug "Loading pmid #{pmid}"
18
18
  add_document(document)
19
19
  end
20
20
 
@@ -55,11 +55,16 @@ EOF
55
55
  Open.mkdir 'tmp'
56
56
 
57
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
58
62
  Open.write("input/#{name}.txt") do |f|
59
- f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
63
+ f.puts "#{name}|a|" << text
60
64
  f.puts
61
65
  end
62
66
  end
67
+
63
68
  Open.write('config', CONFIG)
64
69
  CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
65
70
 
@@ -95,6 +100,7 @@ EOF
95
100
 
96
101
  res[name] = segments
97
102
  end
103
+ res
98
104
  end
99
105
  end
100
106
 
@@ -15,7 +15,6 @@ class PatternRelExt
15
15
  segments = sentence.segments
16
16
  segments = segments.values.flatten if Hash === segments
17
17
  Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
18
- ppp sentence
19
18
  regexpNER.entities(sentence)
20
19
  end
21
20
  end
@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
5
5
  require 'rbbt/segment/token'
6
6
 
7
7
  class TokenTrieNER < NER
8
- def self.clean(token)
8
+ def self.clean(token, stem = false)
9
9
  if token.length > 3
10
- token.downcase.sub(/-/,'')
10
+ upcase = token !~ /[a-z]/
11
+ token = token.downcase.sub(/-/,'')
12
+
13
+ if stem && ! upcase
14
+ require 'stemmer'
15
+ if stem == :double
16
+ token = token.stem.stem
17
+ else
18
+ token = token.stem
19
+ end
20
+ end
21
+
22
+ token
11
23
  else
12
24
  token
13
25
  end
14
26
  end
15
27
 
16
- def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
28
+ def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
17
29
  if no_clean
18
30
  if extend_to_token
19
31
  Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
22
34
  end
23
35
  else
24
36
  if extend_to_token
25
- Token.setup(clean(token), :offset => start, :original => token)
37
+ Token.setup(clean(token, stem), :offset => start, :original => token)
26
38
  else
27
- clean(token)
39
+ clean(token, stem)
28
40
  end
29
41
  end
30
42
  end
31
43
 
32
- def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
44
+ def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
33
45
  split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
34
46
 
35
47
  tokens = []
36
48
  while matchdata = text.match(split_at)
37
- tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
38
- tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
49
+ tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
50
+ tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
39
51
  start += matchdata.end(0)
40
52
  text = matchdata.post_match
41
53
  end
42
54
 
43
- tokens << prepare_token(text, start, extend_to_token) unless text.empty?
55
+ tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
44
56
 
45
57
  tokens
46
58
  end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
130
142
  index1
131
143
  end
132
144
 
133
- def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
145
+ def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
134
146
 
135
147
  chunk_size = hash.size / 100
136
148
  items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
146
158
  names.each do |name|
147
159
  next if name.empty? or (String === name and name.length < 2)
148
160
 
149
- tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
161
+ tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
150
162
  tokens.extend EnumeratedArray
151
163
 
152
164
  token_index = index_for_tokens(tokens, code, type, slack)
@@ -240,7 +252,7 @@ class TokenTrieNER < NER
240
252
  NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
241
253
  end
242
254
 
243
- attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
255
+ attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
244
256
  def initialize(type = nil, file = nil, options = {})
245
257
  options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
246
258
  :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
248
260
  @longest_match = options.delete :longest_match
249
261
  @split_at = options.delete :split_at
250
262
  @no_clean = options.delete :no_clean
263
+ @stem = options.delete :stem
251
264
 
252
265
  file = [] if file.nil?
253
266
  file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
273
286
  Log.debug "TokenTrieNER merging TSV"
274
287
  new.with_unnamed do
275
288
  new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
276
- TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
289
+ TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
277
290
  end
278
291
  end
279
292
  when Hash === new
@@ -284,14 +297,14 @@ class TokenTrieNER < NER
284
297
  new = TSV.open(new, :flat)
285
298
  new.with_unnamed do
286
299
  new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
287
- TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
300
+ TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
288
301
  end
289
302
  end
290
303
  end
291
304
  end
292
305
 
293
306
  def match(text)
294
- tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
307
+ tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
295
308
 
296
309
  tokens.extend EnumeratedArray
297
310
  tokens.pos = 0
@@ -239,6 +239,7 @@ module NLP
239
239
  end
240
240
 
241
241
  def self.geniass_sentence_splitter(text)
242
+ Rbbt.software.opt.Geniass.produce
242
243
  offsets = []
243
244
 
244
245
  cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
294
295
  offsets.collect do |s,e|
295
296
  sentence = text[s..e]
296
297
  next if sentence.nil?
297
- #sentence.gsub!(NEW_LINE_MASK, "\n")
298
+ sentence.gsub!(NEW_LINE_MASK, "\n")
298
299
  Segment.setup sentence, s
299
300
  sentence
300
301
  end
@@ -0,0 +1,52 @@
1
+ require 'rbbt/segment'
2
+ require 'rbbt/document'
3
+ require 'rbbt/segment/annotation'
4
+ require 'rbbt/util/python'
5
+
6
+ module SpaCy
7
+
8
+ PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
9
+
10
+ def self.tokens(text, lang = 'en')
11
+
12
+ tokens = []
13
+ RbbtPython.run 'spacy' do
14
+ nlp = spacy.load(lang)
15
+ doc = nlp.call(text)
16
+ doc.__len__.times do |i|
17
+ tokens << doc.__getitem__(i)
18
+ end
19
+ end
20
+ tokens
21
+ end
22
+
23
+ def self.segments(text, lang = 'en')
24
+ docid = text.docid if Document === text
25
+ corpus = text.corpus if Document === text
26
+ tokens = self.tokens(text, lang).collect do |token|
27
+ info = {}
28
+ PROPERTIES.each do |p|
29
+ info[p] = token.instance_eval(p.to_s)
30
+ end
31
+ info[:type] = "SpaCy"
32
+ info[:offset] = token.idx
33
+ info[:dep] = token.dep_ + "->" + token.head.idx.to_s
34
+ info[:docid] = docid if docid
35
+ info[:corpus] = corpus if corpus
36
+ SpaCyToken.setup(token.text, info)
37
+ end
38
+ SpaCyToken.setup(tokens, :corpus => corpus)
39
+ end
40
+ end
41
+
42
+ module SpaCyToken
43
+ extend Entity
44
+ include SegmentAnnotation
45
+
46
+ self.annotation *SpaCy::PROPERTIES
47
+ self.annotation :dep
48
+ end
49
+
50
+ if __FILE__ == $0
51
+ ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
52
+ end
@@ -1,5 +1,6 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/entity'
3
+ require 'rbbt/document'
3
4
 
4
5
  module SegID
5
6
  extend Entity
@@ -10,11 +11,11 @@ module SegID
10
11
  end
11
12
 
12
13
  def range
13
- @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
14
+ @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
14
15
  end
15
16
 
16
17
  def docid
17
- @docid ||= _parts[0..3] * ":"
18
+ @docid ||= DocID.setup(_parts[0..3] * ":")
18
19
  end
19
20
 
20
21
  def offset
@@ -25,12 +26,13 @@ module SegID
25
26
  range.end - range.begin + 1
26
27
  end
27
28
 
28
- property :segment do
29
+ property :segment => :single do
30
+ docid = self.docid
29
31
  document = DocID.setup(docid, :corpus => corpus).document
30
32
 
31
33
  text = document[range]
32
34
 
33
- Segment.setup(text, docid)
35
+ Segment.setup(text, :docid => docid, :offset => offset)
34
36
  end
35
37
 
36
38
  property :segid do
@@ -1,6 +1,6 @@
1
1
  require 'rbbt-util'
2
- require 'rbbt/entity'
3
2
  require 'rbbt/segment'
3
+ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
@@ -32,7 +32,7 @@ end
32
32
 
33
33
  module SegmentAnnotation
34
34
  extend Entity
35
- include Segment
35
+ include Object::Segment
36
36
  self.annotation :type
37
37
 
38
38
  property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
47
47
  end
48
48
 
49
49
  property :annotid do |corpus=nil|
50
- AnnotID.setup([segid, type] * ":", :corpus => corpus)
50
+ AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
51
51
  end
52
52
 
53
53
  alias id annotid
@@ -0,0 +1,7 @@
1
+ module Relationship
2
+ extend Entity
3
+
4
+ self.annotation :segments
5
+ self.annotation :type
6
+
7
+ end
@@ -69,8 +69,8 @@ module Transformed
69
69
  segments = [segments] unless Array === segments
70
70
  orig_length = self.length
71
71
 
72
- offset = self.respond_to?(:offset) ? self.offset : 0
73
- segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
72
+ offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
+ segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
74
74
 
75
75
  Segment.clean_sort(segments).each do |segment|
76
76
  next if segment.offset.nil?
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
5
5
 
6
6
  get_src "$name" "$url"
7
7
  move_opt "$name"
@@ -36,7 +36,7 @@ class TestAnnotation < Test::Unit::TestCase
36
36
  Document.setup(text, "TEST", "test_doc1", nil)
37
37
 
38
38
  corpus = {}
39
- corpus.extend Document::Corpus
39
+ Document::Corpus.setup corpus
40
40
 
41
41
  corpus.add_document(text)
42
42
 
@@ -50,7 +50,7 @@ class TestAnnotation < Test::Unit::TestCase
50
50
  Document.setup(text2, "TEST", "test_doc2", nil)
51
51
 
52
52
  corpus = {}
53
- corpus.extend Document::Corpus
53
+ Document::Corpus.setup corpus
54
54
 
55
55
  corpus.add_document(text1)
56
56
  corpus.add_document(text2)
@@ -68,7 +68,7 @@ class TestAnnotation < Test::Unit::TestCase
68
68
  Document.setup(text, "TEST", "test_doc1", nil)
69
69
 
70
70
  corpus = {}
71
- corpus.extend Document::Corpus
71
+ Document::Corpus.setup corpus
72
72
 
73
73
  corpus.add_document(text)
74
74
 
@@ -95,7 +95,7 @@ class TestAnnotation < Test::Unit::TestCase
95
95
  Document.setup(text, "TEST", "test_doc1", nil)
96
96
 
97
97
  corpus = {}
98
- corpus.extend Document::Corpus
98
+ Document::Corpus.setup corpus
99
99
 
100
100
  corpus.add_document(text)
101
101
 
@@ -122,7 +122,7 @@ class TestAnnotation < Test::Unit::TestCase
122
122
  Document.setup(text, "TEST", "test_doc1", nil)
123
123
 
124
124
  corpus = {}
125
- corpus.extend Document::Corpus
125
+ Document::Corpus.setup corpus
126
126
 
127
127
  corpus.add_document(text)
128
128
 
@@ -26,7 +26,7 @@ class TestDocumentCorpus < Test::Unit::TestCase
26
26
 
27
27
  corpus.add_document(text)
28
28
 
29
- assert corpus.prefix("TEST:").include?(text.docid)
29
+ assert corpus.docids("TEST:").include?(text.docid)
30
30
  end
31
31
  end
32
32
  end
@@ -5,12 +5,17 @@ Log.severity = 0
5
5
  class TestGNormPlus < Test::Unit::TestCase
6
6
  def test_match
7
7
  text =<<-EOF
8
- We found that TP53 is regulated by MDM2 in Homo sapiens
8
+
9
+ Introduction
10
+
11
+ We found that TP53 is regulated by MDM2 in Homo
12
+ sapiens
9
13
  EOF
10
14
 
11
15
  mentions = GNormPlus.process({:file => text})
16
+
12
17
  assert_equal 1, mentions.length
13
- assert_equal 2, mentions["file"].length
18
+ assert_equal 3, mentions["file"].length
14
19
  end
15
20
 
16
21
  def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
19
24
  EOF
20
25
 
21
26
  mentions = GNormPlus.entities({:file => text})
22
- mentions["file"].include? "TP53"
27
+ assert mentions["file"].include?("TP53")
28
+ mentions["file"].each do |mention|
29
+ assert_equal mention, text[mention.range].sub("\n", ' ')
30
+ end
23
31
  end
24
32
  end
25
33
 
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
7
7
  This is a sentence.
8
8
  A funky character ™ in a sentence.
9
9
  This is a sentence.
10
- This is a
10
+ This is a broken
11
11
  sentence. This is
12
- another sentence.
12
+ another broken sentence.
13
13
  EOF
14
14
 
15
- assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
15
+ iii NLP.geniass_sentence_splitter(text)
16
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
16
17
  end
17
18
 
19
+ def test_sentences_2
20
+ text =<<-EOF
21
+ This is a sentence.
22
+ This is a sentence.
23
+ This is a broken
24
+ sentence. This is
25
+ another broken sentence.
26
+ EOF
27
+
28
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
29
+ end
30
+
31
+ def test_sentences_ext
32
+ text =<<-EOF
33
+ This is a sentence.
34
+ This is a sentence.
35
+ This is a broken
36
+ sentence. This is
37
+ another broken sentence.
38
+ EOF
39
+
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
18
42
  end
19
43
 
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
12
12
  segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
13
13
  annotation = SegmentAnnotation.setup(segment, :type => :verb)
14
14
 
15
- assert_equal 'verb', annotation.annotid.split(":").last
15
+ assert_equal 'verb', annotation.annotid.split(":")[5]
16
16
 
17
17
  annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
18
- assert_equal 'verb', annotation.annotid.split(":").last
18
+ assert_equal 'verb', annotation.annotid.split(":")[5]
19
19
  end
20
20
 
21
21
  def test_annotid
22
22
  text = "This is a document"
23
23
  Document.setup(text, "TEST", "test_doc1", nil)
24
24
 
25
- corpus = {}
26
- corpus.extend Document::Corpus
25
+ corpus = Document::Corpus.setup({})
27
26
 
28
27
  corpus.add_document(text)
29
28
 
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
2
2
  require 'rbbt/segment/encoding'
3
3
 
4
4
  class TestEncoding < Test::Unit::TestCase
5
- def _test_bad_chars
5
+ def test_bad_chars
6
6
  text = "A funky character ™ in a sentence."
7
7
 
8
8
  assert_equal ["™"], Segment.bad_chars(text)
@@ -22,12 +22,13 @@ class TestClass < Test::Unit::TestCase
22
22
  assert_equal "SCORE", a.score
23
23
  end
24
24
 
25
- def __test_tsv
25
+ def test_tsv
26
26
  a = "test"
27
27
  NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
28
- assert Segment.tsv([a]).fields.include? "code"
29
- assert Segment.tsv([a], nil).fields.include? "code"
30
- assert Segment.tsv([a], "literal").fields.include? "code"
28
+ assert Annotated.tsv([a]).fields.include? "code"
29
+ assert Annotated.tsv([a], nil).fields.include? "code"
30
+ assert Annotated.tsv([a], :all).fields.include? "code"
31
+ assert Annotated.tsv([a], :all).fields.include? "literal"
31
32
  end
32
33
 
33
34
  def __test_segment_brat
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
9
9
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
10
10
  Document.setup(text, "TEST", "test_doc1", nil)
11
11
 
12
- corpus = {}
13
- corpus.extend Document::Corpus
12
+ corpus = Document::Corpus.setup({})
14
13
 
15
14
  corpus.add_document(text)
16
15
 
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
17
17
  text = "This is a document"
18
18
  Document.setup(text, "TEST", "test_doc1", nil)
19
19
 
20
- corpus = {}
21
- corpus.extend Document::Corpus
20
+ corpus = Document::Corpus.setup({})
22
21
 
23
22
  corpus.add_document(text)
24
23
 
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
41
40
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
42
41
  Document.setup(text, "TEST", "test_doc1", nil)
43
42
 
44
- corpus = {}
45
- corpus.extend Document::Corpus
43
+ corpus = Document::Corpus.setup({})
46
44
 
47
45
  corpus.add_document(text)
48
46
 
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
65
63
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
66
64
  Document.setup(text, "TEST", "test_doc1", nil)
67
65
 
68
- corpus = {}
69
- corpus.extend Document::Corpus
66
+ corpus = Document::Corpus.setup({})
70
67
 
71
68
  corpus.add_document(text)
72
69
 
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
94
91
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
92
  Document.setup(text, "TEST", "test_doc1", nil)
96
93
 
97
- corpus = {}
98
- corpus.extend Document::Corpus
94
+ corpus = Document::Corpus.setup({})
99
95
 
100
96
  corpus.add_document(text)
101
97
 
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
142
138
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
143
139
  Document.setup(text, "TEST", "test_doc1", nil)
144
140
 
145
- corpus = {}
146
- corpus.extend Document::Corpus
141
+ corpus = Document::Corpus.setup({})
147
142
 
148
143
  corpus.add_document(text)
149
144
 
@@ -0,0 +1,32 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
2
+ require 'rbbt/nlp/spaCy'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestSpaCy < Test::Unit::TestCase
6
+ def _test_tokens
7
+ text = "I tell a story"
8
+
9
+ tokens = SpaCy.tokens(text)
10
+
11
+ assert_equal 4, tokens.length
12
+ assert_equal "tell", tokens[1].to_s
13
+ end
14
+
15
+ def test_segments
16
+ text = "I tell a story. It's a very good story."
17
+
18
+ corpus = Document::Corpus.setup({})
19
+
20
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
21
+
22
+ corpus.add_document text
23
+ text.corpus = corpus
24
+
25
+ segments = SpaCy.segments(text)
26
+
27
+ segments.each do |segment|
28
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
29
+ end
30
+ end
31
+ end
32
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.1
4
+ version: 1.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-19 00:00:00.000000000 Z
11
+ date: 2020-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -102,12 +102,14 @@ files:
102
102
  - lib/rbbt/nlp/genia/sentence_splitter.rb
103
103
  - lib/rbbt/nlp/nlp.rb
104
104
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
105
+ - lib/rbbt/nlp/spaCy.rb
105
106
  - lib/rbbt/segment.rb
106
107
  - lib/rbbt/segment/annotation.rb
107
108
  - lib/rbbt/segment/encoding.rb
108
109
  - lib/rbbt/segment/named_entity.rb
109
110
  - lib/rbbt/segment/overlaps.rb
110
111
  - lib/rbbt/segment/range_index.rb
112
+ - lib/rbbt/segment/relationship.rb
111
113
  - lib/rbbt/segment/segmented.rb
112
114
  - lib/rbbt/segment/token.rb
113
115
  - lib/rbbt/segment/transformed.rb
@@ -161,6 +163,7 @@ files:
161
163
  - test/rbbt/test_document.rb
162
164
  - test/rbbt/test_segment.rb
163
165
  - test/test_helper.rb
166
+ - test/test_spaCy.rb
164
167
  homepage: http://github.com/mikisvaz/rbbt-util
165
168
  licenses: []
166
169
  metadata: {}
@@ -217,4 +220,5 @@ test_files:
217
220
  - test/rbbt/segment/test_encoding.rb
218
221
  - test/rbbt/segment/test_range_index.rb
219
222
  - test/rbbt/segment/test_corpus.rb
223
+ - test/test_spaCy.rb
220
224
  - test/test_helper.rb