rbbt-text 1.1.9 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
4
- data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
3
+ metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
+ data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
5
5
  SHA512:
6
- metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
7
- data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
6
+ metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
+ data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
@@ -10,35 +10,39 @@ module GNormPlus
10
10
  end
11
11
 
12
12
  CONFIG =<<-EOF
13
-
14
13
  #===Annotation
15
14
  #Attribution setting:
16
15
  #FocusSpecies = Taxonomy ID
17
- # All: All species
18
- # 9606: Human
19
- # 4932: yeast
20
- # 7227: Fly
21
- # 10090: Mouse
22
- # 10116: Rat
23
- # 7955: Zebrafish
24
- # 3702: Arabidopsis thaliana
16
+ # All: All species
17
+ # 9606: Human
18
+ # 4932: yeast
19
+ # 7227: Fly
20
+ # 10090: Mouse
21
+ # 10116: Rat
22
+ # 7955: Zebrafish
23
+ # 3702: Arabidopsis thaliana
25
24
  #open: True
26
25
  #close: False
27
26
 
28
27
  [Focus Species]
29
- FocusSpecies = All
28
+ FocusSpecies = 9606
29
+ FilterAntibody = False
30
30
  [Dictionary & Model]
31
31
  DictionaryFolder = ./Dictionary
32
32
  GNRModel = ./Dictionary/GNR.Model
33
33
  SCModel = ./Dictionary/SimConcept.Model
34
34
  GeneIDMatch = True
35
+ HomologeneID = False
35
36
  Normalization2Protein = False
37
+ ShowUnNormalizedMention = False
36
38
  DeleteTmp = True
39
+ IgnoreNER = True
37
40
  EOF
38
41
 
39
42
  def self.process(texts)
40
43
  TmpFile.with_file do |tmpdir|
41
44
  Open.mkdir tmpdir
45
+
42
46
  Misc.in_dir tmpdir do
43
47
  Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
44
48
  Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
@@ -50,12 +54,12 @@ EOF
50
54
 
51
55
  texts.each do |name,text|
52
56
  Open.write("input/#{name}.txt") do |f|
53
- f.puts "#{name}|a|" << text
57
+ f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
54
58
  f.puts
55
59
  end
56
60
  end
57
61
  Open.write('config', CONFIG)
58
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
62
+ CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
59
63
 
60
64
  if texts.respond_to? :key_field
61
65
  key_field = texts.key_field
@@ -68,6 +72,9 @@ EOF
68
72
  entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
69
73
  tsv[name] = entities
70
74
  end
75
+
76
+ raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
77
+
71
78
  tsv
72
79
  end
73
80
  end
@@ -1,5 +1,6 @@
1
1
  require 'rbbt/text/segment'
2
2
  require 'rbbt/text/segment/segmented'
3
+ require 'rbbt/text/segment/docid'
3
4
  require 'rbbt/tsv'
4
5
  require 'rbbt/resource/path'
5
6
  require 'rbbt/persist/tsv'
@@ -15,6 +16,7 @@ class Corpus
15
16
  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
16
17
 
17
18
  attr_accessor :multiple_result
19
+
18
20
  def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
19
21
  @segments = {}
20
22
  @segment_indices = {}
@@ -44,16 +46,22 @@ class Corpus
44
46
  end
45
47
 
46
48
  def self.define(entity, &block)
47
- send :define_method, "produce_#{entity}", &block
49
+ send :define_method, "produce_#{entity}" do
50
+ segments = self.instance_exec &block
48
51
 
49
- self.class_eval <<-EOC, __FILE__, __LINE__
52
+ segments.each{|s| s.docid = docid }
53
+ end
54
+
55
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
50
56
  def load_#{entity}(raw = false)
51
57
  return if segments.include? "#{ entity }"
52
58
  if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
53
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
59
+ entities = load_with_persistence_#{entity}(raw)
54
60
  else
55
- segments["#{ entity }"] = produce_#{entity}
61
+ entities = produce_#{entity}
56
62
  end
63
+
64
+ segments["#{ entity }"] = entities
57
65
  end
58
66
 
59
67
  def #{entity}(raw = false)
@@ -77,7 +85,10 @@ class Corpus
77
85
 
78
86
  def self.define_multiple(entity, &block)
79
87
  send :define_method, "produce_#{entity}" do
80
- return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
88
+ if self.multiple_result && self.multiple_result[entity]
89
+ segments = self.multiple_result[entity]
90
+ return segments.each{|s| s.docid = docid }
91
+ end
81
92
  raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
82
93
  end
83
94
 
@@ -86,14 +97,16 @@ class Corpus
86
97
  self
87
98
  end.send :define_method, name, &block
88
99
 
89
- self.class_eval <<-EOC, __FILE__, __LINE__
100
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
90
101
  def load_#{entity}(raw = false)
91
102
  return if segments.include? "#{ entity }"
92
103
  if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
93
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
104
+ entities = load_with_persistence_#{entity}(raw)
94
105
  else
95
- segments["#{ entity }"] = produce_#{entity}
106
+ entities = produce_#{entity}
96
107
  end
108
+
109
+ segments["#{ entity }"] = entities
97
110
  end
98
111
 
99
112
  def #{entity}(raw = false)
@@ -124,7 +137,7 @@ class Corpus
124
137
  missing << doc
125
138
  end
126
139
  end
127
- res = self.send("multiple_produce_#{entity.to_s}", missing)
140
+ res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
128
141
  case res
129
142
  when Array
130
143
  res.each_with_index do |res,i|
@@ -142,7 +155,9 @@ class Corpus
142
155
  end
143
156
  end
144
157
  end
145
- missing.each{|doc| doc.send entity }
158
+ missing.each{|doc|
159
+ doc.send entity
160
+ }
146
161
  end
147
162
 
148
163
 
@@ -197,7 +212,7 @@ class Corpus
197
212
  FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
198
213
  end
199
214
 
200
- self.class_eval <<-EOC, __FILE__, __LINE__
215
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
201
216
  def load_with_persistence_#{entity}(raw = false)
202
217
  repo = TSV_REPOS["#{ entity }"]
203
218
  if repo.nil?
@@ -253,7 +268,7 @@ class Corpus
253
268
 
254
269
  FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
255
270
 
256
- self.class_eval <<-EOC, __FILE__, __LINE__
271
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
257
272
  def load_with_persistence_#{entity}(raw = false)
258
273
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
259
274
 
@@ -261,20 +276,23 @@ class Corpus
261
276
 
262
277
  begin
263
278
 
264
- data.read true
279
+ if data.respond_to? :persistence_path and String === data.persistence_path
280
+ data.filter(data.persistence_path + '.filters')
281
+ end
282
+
283
+ keys = data.read_and_close do
265
284
 
266
- fields = data.fields if fields.nil? and data.respond_to? :fields
285
+ fields = data.fields if fields.nil? and data.respond_to? :fields
267
286
 
287
+ data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
288
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
289
+ keys = data.keys
290
+ data.pop_filter if fields.include?("#{entity_field}")
291
+ data.pop_filter if fields.include?("#{doc_field}")
268
292
 
269
- if data.respond_to? :persistence_path and String === data.persistence_path
270
- data.filter(data.persistence_path + '.filters')
293
+ keys
271
294
  end
272
295
 
273
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
274
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
275
- keys = data.keys
276
- data.pop_filter if data.fields.include?("#{entity_field}")
277
- data.pop_filter if data.fields.include?("#{doc_field}")
278
296
 
279
297
  if keys.empty?
280
298
  segments = produce_#{entity}
@@ -289,34 +307,38 @@ class Corpus
289
307
  "#{ entity }"
290
308
  end
291
309
 
292
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
293
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
294
- data.write true
295
- keys = tsv.collect do |key, value|
296
- data[key] = value
297
- key
310
+ keys = data.write_and_close do
311
+ data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
312
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
313
+ keys = tsv.collect do |key, value|
314
+ data[key] = value
315
+ key
316
+ end
317
+ data.pop_filter if fields.include?("#{entity_field}")
318
+ data.pop_filter if fields.include?("#{doc_field}")
319
+ keys
298
320
  end
299
- data.pop_filter if data.fields.include?("#{entity_field}")
300
- data.pop_filter if data.fields.include?("#{doc_field}")
301
- data.read
302
321
 
303
322
  else
304
- if raw == :check
305
- data.close
306
- return nil
307
- end
323
+ return nil if raw == :check
308
324
  end
309
325
 
310
326
  return data.values if raw
311
327
 
312
328
  start_pos = data.identify_field "Start"
313
- segments = data.values_at(*keys).collect{|annotation|
329
+ data.read_and_close do
330
+ data.chunked_values_at(keys).collect{|annotation|
331
+ begin
314
332
  pos = annotation[start_pos]
315
- Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
316
- }.compact
317
- data.close
318
-
319
- segments
333
+ Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
334
+ rescue
335
+ Log.exception $!
336
+ iif keys
337
+ iif [text, annotation]
338
+ end
339
+
340
+ }.compact
341
+ end
320
342
  ensure
321
343
  data.close
322
344
  end
@@ -348,7 +370,7 @@ class Corpus
348
370
  segment.segments[name] = annotations
349
371
  class << segment
350
372
  self
351
- end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
373
+ end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
352
374
  end
353
375
 
354
376
  segment
@@ -3,7 +3,7 @@ require 'rbbt/fix_width_table'
3
3
 
4
4
  module Segment
5
5
  extend Annotation
6
- self.annotation :offset
6
+ self.annotation :offset, :docid
7
7
 
8
8
  def segment_length
9
9
  begin
@@ -325,7 +325,7 @@ module Segment
325
325
  tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
326
326
 
327
327
  segments.each do |segment|
328
- tsv[segment.object_id.to_s] = self.tsv_values_for_segment(segment, fields)
328
+ tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
329
329
  end
330
330
 
331
331
  tsv
@@ -348,6 +348,14 @@ module Segment
348
348
  [offset, self.end] * ".."
349
349
  end
350
350
 
351
+ def segment_id
352
+ if self.respond_to?(:docid)
353
+ [docid, locus, Misc.obj2digest(info)] * ":"
354
+ else
355
+ Misc.obj2digest(info)
356
+ end
357
+ end
358
+
351
359
  #def ==(other)
352
360
  # self.text == other.text
353
361
  #end
@@ -1,46 +1,46 @@
1
1
  require 'rbbt/text/segment'
2
2
 
3
- module SegmentWithDocid
4
- extend Annotation
5
-
6
- self.annotation :docid
7
-
8
- def masked?
9
- self[0..5] == "MASKED"
10
- end
11
-
12
- def mask
13
- return self if masked?
14
- raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
- raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
- raise "Cannot mask a segment with no offset" if offset.nil?
17
- textual_position = ["MASKED", length] * ":"
18
- self.replace(textual_position)
19
- self
20
- end
21
-
22
- def unmasked_text
23
- return self unless masked?
24
- tag, length = self.split(":")
25
- Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
26
- end
27
-
28
- def unmask
29
- return self unless masked?
30
- self.replace(unmasked_text)
31
- self
32
- end
33
-
34
- def str_length
35
- self.length
36
- end
37
-
38
- def masked_length
39
- self.split(":").last.to_i
40
- end
41
-
42
- def segment_length
43
- masked? ? masked_length : str_length
44
- end
45
- end
46
-
3
+ #module SegmentWithDocid
4
+ # extend Annotation
5
+ #
6
+ # self.annotation :docid
7
+ #
8
+ # def masked?
9
+ # self[0..5] == "MASKED"
10
+ # end
11
+ #
12
+ # def mask
13
+ # return self if masked?
14
+ # raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
+ # raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
+ # raise "Cannot mask a segment with no offset" if offset.nil?
17
+ # textual_position = ["MASKED", length] * ":"
18
+ # self.replace(textual_position)
19
+ # self
20
+ # end
21
+ #
22
+ # def unmasked_text
23
+ # return self unless masked?
24
+ # tag, length = self.split(":")
25
+ # Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
26
+ # end
27
+ #
28
+ # def unmask
29
+ # return self unless masked?
30
+ # self.replace(unmasked_text)
31
+ # self
32
+ # end
33
+ #
34
+ # def str_length
35
+ # self.length
36
+ # end
37
+ #
38
+ # def masked_length
39
+ # self.split(":").last.to_i
40
+ # end
41
+ #
42
+ # def segment_length
43
+ # masked? ? masked_length : str_length
44
+ # end
45
+ #end
46
+ #
@@ -29,6 +29,7 @@ Score: #{score.inspect}
29
29
  end
30
30
 
31
31
  def entity(params = nil)
32
+ code = self.dup
32
33
  format, entity = code.split(":")
33
34
  entity, format = format, nil if entity.nil?
34
35
 
@@ -111,10 +111,10 @@ module Transformed
111
111
 
112
112
  self[updated_begin..updated_end] = new
113
113
 
114
- @transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
114
+ @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
115
115
 
116
116
  segment.replace original_text
117
- stack << segment.object_id
117
+ stack << segment.segment_id
118
118
  end
119
119
  @transformation_stack << stack
120
120
  end
@@ -8,7 +8,6 @@ class TestGNormPlus < Test::Unit::TestCase
8
8
  We found that TP53 is regulated by MDM2 in Homo sapiens
9
9
  EOF
10
10
 
11
-
12
11
  mentions = GNormPlus.process({:file => text})
13
12
  Log.tsv mentions
14
13
  end
@@ -10,10 +10,6 @@ class TestCorpusDocument < Test::Unit::TestCase
10
10
  Segment.align(self.text, words)
11
11
  end
12
12
 
13
- Open.mkdir Rbbt.tmp.test.annotations.find
14
- Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
15
-
16
-
17
13
  Corpus::Document.define_multiple :words2 do |documents|
18
14
  documents.collect do |doc|
19
15
  words = doc.text.split(" ")
@@ -21,32 +17,66 @@ class TestCorpusDocument < Test::Unit::TestCase
21
17
  end
22
18
  end
23
19
 
20
+ Open.mkdir Rbbt.tmp.test.annotations.find
21
+
22
+ Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
24
23
  Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
25
24
  end
26
25
 
27
26
  def test_words
28
27
  text = "This is a test document"
29
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
28
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
30
29
  assert_equal Segment.sort(document.words), text.split(" ")
30
+ assert document.words.first.docid
31
+ assert document.words.first.segment_id.include?("TEST")
31
32
  end
32
33
 
33
34
  def test_words_multiple
34
35
  document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
35
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
36
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
37
+ document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
36
38
 
37
- docs = [document1, document2]
39
+ docs = [document1, document2, document3]
38
40
 
39
41
  Corpus::Document.prepare_multiple(docs, :words2)
40
-
42
+
43
+ assert document1.words.first.docid
44
+ assert document1.words.first.segment_id.include?("TEST")
45
+
41
46
  assert_equal document1.words2, document1.text.split(" ")
42
47
  assert_equal document2.words2, document2.text.split(" ")
48
+ assert_equal document3.words2, document3.text.split(" ")
43
49
 
44
50
  document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
45
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
51
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
46
52
 
47
53
  docs = [document1, document2]
48
54
 
49
55
  Corpus::Document.prepare_multiple(docs, :words2)
50
56
  end
57
+
58
+ def test_parallel
59
+ text =<<-EOF
60
+ This is a test document number
61
+ EOF
62
+
63
+ docs = []
64
+ 100.times do |i|
65
+ docs << text.chomp + " " + i.to_s
66
+ end
67
+
68
+ Log.with_severity 0 do
69
+ TSV.traverse docs, :cpus => 10, :bar => true do |doc|
70
+ hash = Misc.digest(doc)
71
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
72
+ assert_equal Segment.sort(document.words), document.text.split(" ")
73
+ end
74
+ TSV.traverse docs, :cpus => 10, :bar => true do |doc|
75
+ hash = Misc.digest(doc)
76
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
77
+ assert_equal Segment.sort(document.words), document.text.split(" ")
78
+ end
79
+ end
80
+ end
51
81
  end
52
82
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-13 00:00:00.000000000 Z
11
+ date: 2020-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util