rbbt-text 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a6965ecde1b38d5bc93d4836ee6d757e2add39a51d64c2f06142bbbd303e22d7
-  data.tar.gz: a5c32ea03ea8214dd8c94ef6e884b59e459e3a7a8e3d26065a0a046b5b9b4778
+  metadata.gz: 05b1cf1981e955652598dd3db811cf8e6a7d64b68535e21834012abe90efe388
+  data.tar.gz: 67017f8a10cbfae51664999218336d638ea6be7c29b5ec305872473672977a41
 SHA512:
-  metadata.gz: 756d240a796e5ac88b4b55368e0e4e3af14b3dd2d8b8b55e49839c3cdc3fa45ee807d648cf86b45b62e7f2f4d9e7fc15567ab21d3356e37a5c3c4316cbcaa841
-  data.tar.gz: 6caa03ec51185cac00cc436bac999b063fccfcc1dbf0e2c09359dad7171c0eea37f80436cc860038a2c1ad17eb9b67a03e88d1ae8ef406ce1c5c874d375d1abd
+  metadata.gz: 03b02dcea1040edfa653e976d9f2f808ed25f9e0164add2fc85afa4417cf8e10ff8dfb27e1927c457f0ff6c6ee90311765ac364b7c5d7c8d9fd51cfff4ab9434
+  data.tar.gz: a5c44f475241da67863ac33ea446e7dbc64283ca53d6642c7c67c1c6e2e34a5d28b1ad678d2a5a44bf3316ed3c063f2d084628b8ff4d79aee8be04db3f8a6ab1
@@ -87,5 +87,3 @@ class String
     BagOfWords.bigrams(self)
   end
 end
-
-
@@ -95,7 +95,7 @@ class Dictionary::TF_IDF
     }
 
     if limit
-      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
     else
       Hash[*best.flatten]
     end
@@ -177,7 +177,7 @@ class Dictionary::KL
       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
     }
     if limit
-      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
     else
      best
    end
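Note on the two slice changes above: Ruby's Array#slice(start, length) takes a length, not an end index, so slice(0, limit) already returned at most limit entries; the new slice(0, limit-1) returns one entry fewer. A quick illustration with a hypothetical scores hash:

    best = { "gene" => 0.9, "cell" => 0.7, "assay" => 0.4 }
    sorted = best.sort{|a,b| b[1] <=> a[1]}   # pairs sorted by descending score
    sorted.slice(0, 3).length                 # => 3  (1.3.1 behavior for limit = 3)
    sorted.slice(0, 3 - 1).length             # => 2  (1.3.2 returns limit - 1 terms)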
@@ -1,6 +1,5 @@
 require 'rbbt-util'
 require 'rbbt/entity'
-require 'rbbt/document/annotation'
 
 module DocID
   extend Entity
@@ -19,10 +18,21 @@ module DocID
     DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
   end
 
-  def document
-    text = self.corpus[self]
-    namespace, id, type = self.split(":")
-    Document.setup(text, namespace, id, type, :corpus => corpus)
+  property :document => :both do
+    if Array === self
+      namespace, id, type = nil, nil, nil
+      docs = self.collect do |docid|
+        text = self.corpus[docid]
+        namespace, id, type = docid.split(":")
+        #Document.setup(text, namespace, id, type, :corpus => corpus)
+        text
+      end
+      Document.setup(docs, :corpus => corpus)
+    else
+      text = self.corpus[self]
+      namespace, id, type = self.split(":")
+      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
+    end
   end
 end
 
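Note: document is now an Entity property declared with :both, so it dispatches on whether the receiver is a single DocID or an array of them. A minimal sketch, following the usage in the gem's tests:

    corpus = Document::Corpus.setup({})
    text = "This is a document"
    Document.setup(text, "TEST", "test_doc1", nil)
    corpus.add_document(text)

    DocID.setup(text.docid, :corpus => corpus).document    # single document
    DocID.setup([text.docid], :corpus => corpus).document  # array of documents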
@@ -1,3 +1,4 @@
+require 'rbbt/segment'
 require 'rbbt/segment/annotation'
 
 module Document
@@ -22,17 +23,19 @@ module Document
     send :property, type => :multiple do |list|
       doc_segments = self.instance_exec list, &block
 
-      doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
+      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
 
       doc_segments.each_with_index do |segments,i|
+        next if segments.nil?
         document = list[i]
-        Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
 
         segments.each do |segment|
           SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
         end
 
         docid = document.docid
+
         segments.each{|s| s.docid = docid if s.docid.nil? }
 
         segments
@@ -3,17 +3,40 @@ require 'rbbt-util'
 module Document::Corpus
 
   def self.setup(corpus)
-    corpus.extend Document::Corpus
+    corpus.extend Document::Corpus unless Document::Corpus === corpus
+    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus
   end
 
   def add_document(document)
-    self[document.docid] = document
+    docid = document.docid
+    return document if self.include?(docid)
+    self.write_and_close do
+      self[docid] = document
+    end
+  end
+
+  def docids(prefix)
+    prefix += ":" unless prefix[-1] == ":"
+    docids = self.read_and_close do
+      self.prefix(prefix)
+    end
+    DocID.setup(docids, :corpus => self)
+  end
+
+  def documents(prefix)
+    self.docids(prefix).document
   end
 
   def [](*args)
     docid, *rest = args
-    res = super(*args)
+
+    res = self.read_and_close do
+      super(*args)
+    end
+
     return res if args.length > 1
+
     namespace, id, type = docid.split(":")
 
     if res.nil?
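Note: setup now also extends the corpus with Persist::TSVAdapter, so reads and writes go through read_and_close/write_and_close, and the new docids/documents helpers look documents up by docid prefix. A sketch mirroring test/rbbt/document/test_corpus.rb, assuming a corpus backend that supports prefix queries:

    corpus = Document::Corpus.setup({})
    corpus.add_document(text)

    corpus.docids("TEST")      # a trailing ":" is appended to the prefix if missing
    corpus.documents("TEST:")  # equivalent to corpus.docids("TEST:").document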
@@ -6,7 +6,6 @@ module Document::Corpus
     type = nil if String === type and type.empty?
 
     res = PubMed.get_article(pmids).collect do |pmid, article|
-      Log.debug "Loading pmid #{pmid}"
       document = if type.nil? || type.to_sym == :abstract
         Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
       elsif type.to_sym == :title
@@ -15,6 +14,7 @@ module Document::Corpus
         raise "No FullText available for #{ pmid }" if article.full_text.nil?
         Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
       end
+      Log.debug "Loading pmid #{pmid}"
       add_document(document)
     end
 
@@ -55,11 +55,16 @@ EOF
     Open.mkdir 'tmp'
 
     texts.each do |name,text|
+      text = Misc.fixutf8(text)
+
+      text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
+
       Open.write("input/#{name}.txt") do |f|
-        f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
+        f.puts "#{name}|a|" << text
         f.puts
       end
     end
+
     Open.write('config', CONFIG)
     CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
 
@@ -95,6 +100,7 @@ EOF
 
       res[name] = segments
     end
+    res
   end
 end
 
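Note: GNormPlus input files follow the PubTator convention, where '|' separates the id and section in the "#{name}|a|" header line, so the new sanitization step fixes the encoding and strips characters that would break the format before writing input/#{name}.txt. Roughly what it does, on made-up text:

    text = "TP53|MDM2\tregulation\nin Homo sapiens"
    text = Misc.fixutf8(text)
    text.gsub('|', '#').gsub("\n", " ").gsub(/\t/, ' ')
    # => "TP53#MDM2 regulation in Homo sapiens"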
@@ -15,7 +15,6 @@ class PatternRelExt
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
     Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
-      ppp sentence
       regexpNER.entities(sentence)
     end
   end
@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
 require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
         Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
       end
     else
       if extend_to_token
-        Token.setup(clean(token), :offset => start, :original => token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
     split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
     names.each do |name|
       next if name.empty? or (String === name and name.length < 2)
 
-      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
       tokens.extend EnumeratedArray
 
       token_index = index_for_tokens(tokens, code, type, slack)
@@ -240,7 +252,7 @@ class TokenTrieNER < NER
     NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,14 +297,14 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     end
   end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
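Note: the new :stem option threads through clean, prepare_token, tokenize, and process, so dictionary names and query text are stemmed with the same setting; all-caps tokens and tokens of three characters or fewer are left unstemmed, and :double applies the stemmer twice. A hedged sketch, assuming the stemmer gem is available:

    require 'stemmer'
    TokenTrieNER.clean("Regulated", true)   # downcased, then Porter-stemmed (e.g. "regul")
    TokenTrieNER.clean("TP53", true)        # all-caps, so only downcased => "tp53"
    ner = TokenTrieNER.new("genes", nil, :stem => true)  # stems both dictionary and query tokens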
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end
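Note: the new lib/rbbt/nlp/spaCy.rb drives spaCy through RbbtPython and wraps each token as a SpaCyToken segment carrying its offset, lemma, POS tag, and dependency arc back into the source document. A usage sketch taken from the new test, assuming spaCy and its 'en' model are installed:

    corpus = Document::Corpus.setup({})
    text = "I tell a story. It's a very good story."
    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
    corpus.add_document text
    text.corpus = corpus

    SpaCy.segments(text).each do |token|
      token.offset  # character position in the original document
      token.dep     # dependency relation plus the head token's character index
    end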
@@ -1,5 +1,6 @@
 require 'rbbt-util'
 require 'rbbt/entity'
+require 'rbbt/document'
 
 module SegID
   extend Entity
@@ -10,11 +11,11 @@ module SegID
   end
 
   def range
-    @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
   end
 
   def docid
-    @docid ||= _parts[0..3] * ":"
+    @docid ||= DocID.setup(_parts[0..3] * ":")
   end
 
   def offset
@@ -25,12 +26,13 @@ module SegID
     range.end - range.begin + 1
   end
 
-  property :segment do
+  property :segment => :single do
+    docid = self.docid
     document = DocID.setup(docid, :corpus => corpus).document
 
     text = document[range]
 
-    Segment.setup(text, docid)
+    Segment.setup(text, :docid => docid, :offset => offset)
   end
 
   property :segid do
@@ -1,6 +1,6 @@
 require 'rbbt-util'
-require 'rbbt/entity'
 require 'rbbt/segment'
+require 'rbbt/entity'
 
 module AnnotID
   extend Entity
@@ -32,7 +32,7 @@ end
 
 module SegmentAnnotation
   extend Entity
-  include Segment
+  include Object::Segment
   self.annotation :type
 
   property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
   end
 
   property :annotid do |corpus=nil|
-    AnnotID.setup([segid, type] * ":", :corpus => corpus)
+    AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
   end
 
   alias id annotid
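Note: annotid now appends Misc.obj2digest(self.info), so two annotations over the same segment with different info get distinct ids; the type consequently moves from the last colon-separated field to a fixed position, which is why the tests further down switch from split(":").last to split(":")[5]:

    parts = annotation.annotid.split(":")
    parts[5]    # => "verb" (the annotation type)
    parts.last  # digest of the annotation's info hash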
@@ -0,0 +1,7 @@
+module Relationship
+  extend Entity
+
+  self.annotation :segments
+  self.annotation :type
+
+end
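Note: lib/rbbt/segment/relationship.rb is a new, minimal Entity with :segments and :type annotations and no behavior yet. A hypothetical setup call, for illustration only:

    Relationship.setup("TP53 regulated by MDM2",
                       :segments => [tp53_segment, mdm2_segment],  # hypothetical segments
                       :type => "regulation")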
@@ -69,8 +69,8 @@ module Transformed
       segments = [segments] unless Array === segments
       orig_length = self.length
 
-      offset = self.respond_to?(:offset) ? self.offset : 0
-      segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
+      offset = self.respond_to?(:offset) ? self.offset.to_i : 0
+      segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
 
       Segment.clean_sort(segments).each do |segment|
         next if segment.offset.nil?
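Note: coercing with to_i makes the selection robust to segments (or documents) whose offset is nil or stored as a string, since nil.to_i is 0 and "12".to_i is 12; previously a nil offset raised NoMethodError in the comparison:

    nil.to_i    # => 0
    # 1.3.1: s.offset >= offset        # NoMethodError when s.offset is nil
    # 1.3.2: s.offset.to_i >= offset   # nil/string offsets stay comparable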
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 name='OpenNLP'
-url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
+url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
 
 get_src "$name" "$url"
 move_opt "$name"
@@ -36,7 +36,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -50,7 +50,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text2, "TEST", "test_doc2", nil)
 
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
 
     corpus.add_document(text1)
     corpus.add_document(text2)
@@ -68,7 +68,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -95,7 +95,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -122,7 +122,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-    corpus.extend Document::Corpus
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -26,7 +26,7 @@ class TestDocumentCorpus < Test::Unit::TestCase
 
       corpus.add_document(text)
 
-      assert corpus.prefix("TEST:").include?(text.docid)
+      assert corpus.docids("TEST:").include?(text.docid)
     end
   end
 end
@@ -5,12 +5,17 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-We found that TP53 is regulated by MDM2 in Homo sapiens
+
+Introduction
+
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
 EOF
 
     mentions = GNormPlus.process({:file => text})
+
     assert_equal 1, mentions.length
-    assert_equal 2, mentions["file"].length
+    assert_equal 3, mentions["file"].length
   end
 
   def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
 EOF
 
     mentions = GNormPlus.entities({:file => text})
-    mentions["file"].include? "TP53"
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end
 
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
 This is a sentence.
 A funky character ™ in a sentence.
 This is a sentence.
-This is a
+This is a broken
 sentence. This is
-another sentence.
+another broken sentence.
 EOF
 
-    assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
+    iii NLP.geniass_sentence_splitter(text)
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
 
+  def test_sentences_2
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+EOF
+
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+
+  def test_sentences_ext
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+EOF
+
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+  end
 end
 
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
     segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
     annotation = SegmentAnnotation.setup(segment, :type => :verb)
 
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
 
     annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":").last
+    assert_equal 'verb', annotation.annotid.split(":")[5]
   end
 
   def test_annotid
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
 require 'rbbt/segment/encoding'
 
 class TestEncoding < Test::Unit::TestCase
-  def _test_bad_chars
+  def test_bad_chars
     text = "A funky character ™ in a sentence."
 
     assert_equal ["™"], Segment.bad_chars(text)
@@ -22,12 +22,13 @@ class TestClass < Test::Unit::TestCase
     assert_equal "SCORE", a.score
   end
 
-  def __test_tsv
+  def test_tsv
     a = "test"
     NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
-    assert Segment.tsv([a]).fields.include? "code"
-    assert Segment.tsv([a], nil).fields.include? "code"
-    assert Segment.tsv([a], "literal").fields.include? "code"
+    assert Annotated.tsv([a]).fields.include? "code"
+    assert Annotated.tsv([a], nil).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "literal"
   end
 
   def __test_segment_brat
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -0,0 +1,32 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
+require 'rbbt/nlp/spaCy'
+require 'rbbt/document/corpus'
+
+class TestSpaCy < Test::Unit::TestCase
+  def _test_tokens
+    text = "I tell a story"
+
+    tokens = SpaCy.tokens(text)
+
+    assert_equal 4, tokens.length
+    assert_equal "tell", tokens[1].to_s
+  end
+
+  def test_segments
+    text = "I tell a story. It's a very good story."
+
+    corpus = Document::Corpus.setup({})
+
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+
+    corpus.add_document text
+    text.corpus = corpus
+
+    segments = SpaCy.segments(text)
+
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+end
+
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.1
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-04-19 00:00:00.000000000 Z
+date: 2020-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -102,12 +102,14 @@ files:
 - lib/rbbt/nlp/genia/sentence_splitter.rb
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
+- lib/rbbt/nlp/spaCy.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
 - lib/rbbt/segment/named_entity.rb
 - lib/rbbt/segment/overlaps.rb
 - lib/rbbt/segment/range_index.rb
+- lib/rbbt/segment/relationship.rb
 - lib/rbbt/segment/segmented.rb
 - lib/rbbt/segment/token.rb
 - lib/rbbt/segment/transformed.rb
@@ -161,6 +163,7 @@ files:
 - test/rbbt/test_document.rb
 - test/rbbt/test_segment.rb
 - test/test_helper.rb
+- test/test_spaCy.rb
 homepage: http://github.com/mikisvaz/rbbt-util
 licenses: []
 metadata: {}
@@ -217,4 +220,5 @@ test_files:
 - test/rbbt/segment/test_encoding.rb
 - test/rbbt/segment/test_range_index.rb
 - test/rbbt/segment/test_corpus.rb
+- test/test_spaCy.rb
 - test/test_helper.rb