rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
data/lib/rbbt/ner/linnaeus.rb
@@ -1,12 +1,12 @@
 require 'rjb'
 require 'rbbt'
-require 'rbbt/text/segment/named_entity'
+require 'rbbt/segment/named_entity'
 
 module Linnaeus
 
   Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
 
-  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
+  ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
 
 
   Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
@@ -31,7 +31,7 @@ module Linnaeus
     init unless defined? @@Matcher
 
     @@Matcher.match(text).toArray().collect do |mention|
-      NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
+      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
     end
   end
 end
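
A note on the recurring API change in these hunks: NamedEntity.setup no longer takes positional arguments (offset, type, code, score) but an options hash with explicit keys. A minimal before/after sketch with hypothetical values:

    # 1.1.9 style (positional; argument order was easy to get wrong):
    NamedEntity.setup("E. coli", 10, "Organism", "562", 0.9)

    # 1.3.3 style (options hash; omitted keys simply stay unset):
    NamedEntity.setup("E. coli", :offset => 10, :entity_type => "Organism",
                      :code => "562", :score => 0.9)

The same rewrite appears below in the NGramPrefixDictionary, OSCAR3/OSCAR4, RegExpNER and TokenTrieNER diffs; note that the type annotation is renamed entity_type throughout.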
data/lib/rbbt/ner/ngram_prefix_dictionary.rb
@@ -1,8 +1,8 @@
 require 'rbbt'
 require 'rbbt/util/misc'
 require 'rbbt/tsv'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/token'
+require 'rbbt/segment'
+require 'rbbt/segment/token'
 require 'rbbt/ner/NER'
 require 'inline'
 
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 
   def match(text)
     matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
-      NamedEntity.setup(name, offset, type, code)
+      NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
     }
 
     if case_insensitive
data/lib/rbbt/ner/oscar3.rb
@@ -1,7 +1,6 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -53,7 +52,7 @@ class OSCAR3 < NER
       next unless type.nil? or type.include? mention_type
       score = memm ? entities.get(key).to_string.to_f : nil
 
-      NamedEntity.setup mention, rstart.to_i + offset, mention_type, nil, score
+      NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
 
       mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
     end
data/lib/rbbt/ner/oscar4.rb
@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -25,7 +25,7 @@ class OSCAR4 < NER
     @@tagger ||= @@OSCAR.new()
   end
 
-  def self.match(text, type = nil)
+  def self.match(text, protect = false, type = nil)
     self.init
 
     return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER
 
       next unless entity.getType.toString == type unless type.nil?
 
-      NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
+      NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
 
       result << mention
     end
data/lib/rbbt/ner/patterns.rb
@@ -1,7 +1,7 @@
-require 'rbbt/text/segment/named_entity'
-require 'rbbt/text/segment/segmented'
-require 'rbbt/text/segment/transformed'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/segment/named_entity'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/transformed'
+#require 'rbbt/segment/relationship'
 require 'rbbt/ner/regexpNER'
 require 'rbbt/ner/token_trieNER'
 require 'rbbt/nlp/nlp'
@@ -14,7 +14,7 @@ class PatternRelExt
     regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
-    Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+    Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
       regexpNER.entities(sentence)
     end
   end
data/lib/rbbt/ner/regexpNER.rb
@@ -1,4 +1,3 @@
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/simpleDSL'
 
@@ -23,7 +22,7 @@ class RegExpNER < NER
       end
 
       if match and not match.empty?
-        NamedEntity.setup(match, start + pre.length, type)
+        NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
         matches << match
       end
 
data/lib/rbbt/ner/token_trieNER.rb
@@ -1,46 +1,58 @@
 require 'rbbt'
 require 'rbbt/tsv'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/token'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
+require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(token, :offset => start, :original => token)
       else
         token
       end
     else
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
      else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,14 +142,14 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
     tmp_index = {}
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
-      names.flatten! if Array === names.first and not Token === names.first.first
+      names.flatten! if Array === names.first and not Segment === names.first.first
 
       if names.empty?
         names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
-        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
         tokens.extend EnumeratedArray
 
         token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,10 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,18 +297,18 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     end
   end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
-
+
     matches = []
     while tokens.left?
      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
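
TokenTrieNER gains an optional stemming mode, threaded through clean, prepare_token, tokenize, process and match above. A hedged usage sketch (the type and dictionary file are hypothetical; stemming relies on the stemmer gem, which clean requires lazily):

    # :stem => true stems lowercase tokens longer than three characters;
    # :stem => :double applies the stemmer twice (see self.clean above).
    # All-uppercase tokens (likely acronyms) are never stemmed.
    ner = TokenTrieNER.new("Gene", "gene_names.tsv", :stem => true)
    matches = ner.match("tumor suppressors and oncogenes")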
data/lib/rbbt/nlp/genia/sentence_splitter.rb
@@ -1,5 +1,5 @@
 require 'rbbt/nlp/nlp'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
 
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
data/lib/rbbt/nlp/nlp.rb
@@ -2,8 +2,8 @@ require 'rbbt'
 require 'rbbt/util/tmpfile'
 require 'rbbt/persist'
 require 'rbbt/resource'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/segmented'
+require 'rbbt/segment'
+require 'rbbt/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -101,7 +101,7 @@ module NLP
     input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
     sentence_tokens = TmpFile.with_file(input) do |fin|
       out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
-        CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
+        CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
       end
 
       out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP
 
 
   def self.gdep_parse_sentences_extension(sentences)
-    require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
+    require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
     gdep = Gdep.new
     if not gdep.gdep_is_loaded
-      Misc.in_dir Rbbt.software.opt.Gdep.find do
+      Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
        gdep.load_gdep
      end
    end
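
The .find → .produce.find changes here and in the Linnaeus and Geniass diffs follow the rbbt resource pattern: Rbbt.claim registers how to build a software directory, and produce triggers that install step on first use instead of assuming it already ran. A sketch of the pattern as it appears in these files:

    # Declared once per tool (verbatim from the Geniass diff above):
    Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find

    # `find` merely resolves the path; `produce` first runs the claimed
    # install if the path is missing, so a fresh checkout self-installs.
    Rbbt.software.opt.Geniass.produce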
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb
@@ -1,6 +1,6 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/resource'
 
 module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
   def self.sentence_splitter(text)
     return [] if text.nil? or text.empty?
 
-    text = Misc.to_utf8(text)
-    last = 0
-    begin
-      sentence_split_detector = self.sentence_split_detector
-
-      sentences = nil
-      TmpFile.with_file do |tmpfile|
-        start_time = Time.now
-
-        begin
-          pid = Process.fork do
-            sent = sentence_split_detector.sentDetect(text)
-            Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
-          end
+    Segment.ascii(text, "?") do
+      last = 0
+      begin
+        sentence_split_detector = self.sentence_split_detector
 
-        while not Process.waitpid(pid)
-          if Time.now - start_time > MAX
-            Process.kill(9, pid)
-            raise "Taking to long (> #{MAX} seconds)"
-          end
-          sleep 0.1
-        end
+        sentences = nil
+        TmpFile.with_file do |tmpfile|
+          start_time = Time.now
 
          begin
-          Process.waitpid(pid)
+            pid = Process.fork do
+              sent = sentence_split_detector.sentDetect(text)
+              Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
+            end
+
+            while not Process.waitpid(pid)
+              if Time.now - start_time > MAX
+                Process.kill(9, pid)
+                raise "Taking to long (> #{MAX} seconds)"
+              end
+              sleep 0.1
+            end
+
+            begin
+              Process.waitpid(pid)
+            end
+          rescue Errno::ECHILD
           end
-        rescue Errno::ECHILD
+
+          sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
         end
 
-        sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
+        sentences.collect{|sentence|
+          sentence = Misc.to_utf8(sentence)
+          start = text.index(sentence, last)
+          Segment.setup sentence, start
+          last = start + sentence.length - 1
+          sentence
+        }
+      rescue Exception
+        raise $!
+        raise "Sentence splitter raised exception: #{$!.message}"
       end
-
-      sentences.collect{|sentence|
-        sentence = Misc.to_utf8(sentence)
-        start = text.index(sentence, last)
-        Segment.setup sentence, start
-        last = start + sentence.length - 1
-        sentence
-      }
-    rescue Exception
-      raise $!
-      raise "Sentence splitter raised exception: #{$!.message}"
     end
   end
 end
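
The splitter body is now wrapped in Segment.ascii(text, "?") (defined in data/lib/rbbt/segment/encoding.rb, which is not shown in this diff). A plausible reading, offered as an assumption: non-ASCII characters are masked with a one-byte placeholder while the Java tool runs, so the offsets it reports line up with Ruby character indices. An illustration of the mismatch being guarded against:

    text = "Aβ peptide binds."
    text.bytesize   # => 18 bytes ("β" is two bytes in UTF-8)
    text.length     # => 17 characters
    # Masking non-ASCII with a single-byte stand-in aligns the two counts:
    masked = text.gsub(/[^[:ascii:]]/, "?")
    masked.bytesize # => 17
    masked.length   # => 17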
data/lib/rbbt/nlp/spaCy.rb (new file)
@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end