rbbt-text 1.1.9 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
4
- data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
3
+ metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
4
+ data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
5
5
  SHA512:
6
- metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
7
- data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
6
+ metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
7
+ data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
@@ -10,35 +10,39 @@ module GNormPlus
10
10
  end
11
11
 
12
12
  CONFIG =<<-EOF
13
-
14
13
  #===Annotation
15
14
  #Attribution setting:
16
15
  #FocusSpecies = Taxonomy ID
17
- # All: All species
18
- # 9606: Human
19
- # 4932: yeast
20
- # 7227: Fly
21
- # 10090: Mouse
22
- # 10116: Rat
23
- # 7955: Zebrafish
24
- # 3702: Arabidopsis thaliana
16
+ # All: All species
17
+ # 9606: Human
18
+ # 4932: yeast
19
+ # 7227: Fly
20
+ # 10090: Mouse
21
+ # 10116: Rat
22
+ # 7955: Zebrafish
23
+ # 3702: Arabidopsis thaliana
25
24
  #open: True
26
25
  #close: False
27
26
 
28
27
  [Focus Species]
29
- FocusSpecies = All
28
+ FocusSpecies = 9606
29
+ FilterAntibody = False
30
30
  [Dictionary & Model]
31
31
  DictionaryFolder = ./Dictionary
32
32
  GNRModel = ./Dictionary/GNR.Model
33
33
  SCModel = ./Dictionary/SimConcept.Model
34
34
  GeneIDMatch = True
35
+ HomologeneID = False
35
36
  Normalization2Protein = False
37
+ ShowUnNormalizedMention = False
36
38
  DeleteTmp = True
39
+ IgnoreNER = True
37
40
  EOF
38
41
 
39
42
  def self.process(texts)
40
43
  TmpFile.with_file do |tmpdir|
41
44
  Open.mkdir tmpdir
45
+
42
46
  Misc.in_dir tmpdir do
43
47
  Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
44
48
  Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
@@ -50,12 +54,12 @@ EOF
50
54
 
51
55
  texts.each do |name,text|
52
56
  Open.write("input/#{name}.txt") do |f|
53
- f.puts "#{name}|a|" << text
57
+ f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
54
58
  f.puts
55
59
  end
56
60
  end
57
61
  Open.write('config', CONFIG)
58
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
62
+ CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
59
63
 
60
64
  if texts.respond_to? :key_field
61
65
  key_field = texts.key_field
@@ -68,6 +72,9 @@ EOF
68
72
  entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
69
73
  tsv[name] = entities
70
74
  end
75
+
76
+ raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
77
+
71
78
  tsv
72
79
  end
73
80
  end
@@ -1,5 +1,6 @@
1
1
  require 'rbbt/text/segment'
2
2
  require 'rbbt/text/segment/segmented'
3
+ require 'rbbt/text/segment/docid'
3
4
  require 'rbbt/tsv'
4
5
  require 'rbbt/resource/path'
5
6
  require 'rbbt/persist/tsv'
@@ -15,6 +16,7 @@ class Corpus
15
16
  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
16
17
 
17
18
  attr_accessor :multiple_result
19
+
18
20
  def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
19
21
  @segments = {}
20
22
  @segment_indices = {}
@@ -44,16 +46,22 @@ class Corpus
44
46
  end
45
47
 
46
48
  def self.define(entity, &block)
47
- send :define_method, "produce_#{entity}", &block
49
+ send :define_method, "produce_#{entity}" do
50
+ segments = self.instance_exec &block
48
51
 
49
- self.class_eval <<-EOC, __FILE__, __LINE__
52
+ segments.each{|s| s.docid = docid }
53
+ end
54
+
55
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
50
56
  def load_#{entity}(raw = false)
51
57
  return if segments.include? "#{ entity }"
52
58
  if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
53
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
59
+ entities = load_with_persistence_#{entity}(raw)
54
60
  else
55
- segments["#{ entity }"] = produce_#{entity}
61
+ entities = produce_#{entity}
56
62
  end
63
+
64
+ segments["#{ entity }"] = entities
57
65
  end
58
66
 
59
67
  def #{entity}(raw = false)
@@ -77,7 +85,10 @@ class Corpus
77
85
 
78
86
  def self.define_multiple(entity, &block)
79
87
  send :define_method, "produce_#{entity}" do
80
- return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
88
+ if self.multiple_result && self.multiple_result[entity]
89
+ segments = self.multiple_result[entity]
90
+ return segments.each{|s| s.docid = docid }
91
+ end
81
92
  raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
82
93
  end
83
94
 
@@ -86,14 +97,16 @@ class Corpus
86
97
  self
87
98
  end.send :define_method, name, &block
88
99
 
89
- self.class_eval <<-EOC, __FILE__, __LINE__
100
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
90
101
  def load_#{entity}(raw = false)
91
102
  return if segments.include? "#{ entity }"
92
103
  if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
93
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
104
+ entities = load_with_persistence_#{entity}(raw)
94
105
  else
95
- segments["#{ entity }"] = produce_#{entity}
106
+ entities = produce_#{entity}
96
107
  end
108
+
109
+ segments["#{ entity }"] = entities
97
110
  end
98
111
 
99
112
  def #{entity}(raw = false)
@@ -124,7 +137,7 @@ class Corpus
124
137
  missing << doc
125
138
  end
126
139
  end
127
- res = self.send("multiple_produce_#{entity.to_s}", missing)
140
+ res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
128
141
  case res
129
142
  when Array
130
143
  res.each_with_index do |res,i|
@@ -142,7 +155,9 @@ class Corpus
142
155
  end
143
156
  end
144
157
  end
145
- missing.each{|doc| doc.send entity }
158
+ missing.each{|doc|
159
+ doc.send entity
160
+ }
146
161
  end
147
162
 
148
163
 
@@ -197,7 +212,7 @@ class Corpus
197
212
  FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
198
213
  end
199
214
 
200
- self.class_eval <<-EOC, __FILE__, __LINE__
215
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
201
216
  def load_with_persistence_#{entity}(raw = false)
202
217
  repo = TSV_REPOS["#{ entity }"]
203
218
  if repo.nil?
@@ -253,7 +268,7 @@ class Corpus
253
268
 
254
269
  FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
255
270
 
256
- self.class_eval <<-EOC, __FILE__, __LINE__
271
+ self.class_eval <<-EOC, __FILE__, __LINE__ + 1
257
272
  def load_with_persistence_#{entity}(raw = false)
258
273
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
259
274
 
@@ -261,20 +276,23 @@ class Corpus
261
276
 
262
277
  begin
263
278
 
264
- data.read true
279
+ if data.respond_to? :persistence_path and String === data.persistence_path
280
+ data.filter(data.persistence_path + '.filters')
281
+ end
282
+
283
+ keys = data.read_and_close do
265
284
 
266
- fields = data.fields if fields.nil? and data.respond_to? :fields
285
+ fields = data.fields if fields.nil? and data.respond_to? :fields
267
286
 
287
+ data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
288
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
289
+ keys = data.keys
290
+ data.pop_filter if fields.include?("#{entity_field}")
291
+ data.pop_filter if fields.include?("#{doc_field}")
268
292
 
269
- if data.respond_to? :persistence_path and String === data.persistence_path
270
- data.filter(data.persistence_path + '.filters')
293
+ keys
271
294
  end
272
295
 
273
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
274
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
275
- keys = data.keys
276
- data.pop_filter if data.fields.include?("#{entity_field}")
277
- data.pop_filter if data.fields.include?("#{doc_field}")
278
296
 
279
297
  if keys.empty?
280
298
  segments = produce_#{entity}
@@ -289,34 +307,38 @@ class Corpus
289
307
  "#{ entity }"
290
308
  end
291
309
 
292
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
293
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
294
- data.write true
295
- keys = tsv.collect do |key, value|
296
- data[key] = value
297
- key
310
+ keys = data.write_and_close do
311
+ data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
312
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
313
+ keys = tsv.collect do |key, value|
314
+ data[key] = value
315
+ key
316
+ end
317
+ data.pop_filter if fields.include?("#{entity_field}")
318
+ data.pop_filter if fields.include?("#{doc_field}")
319
+ keys
298
320
  end
299
- data.pop_filter if data.fields.include?("#{entity_field}")
300
- data.pop_filter if data.fields.include?("#{doc_field}")
301
- data.read
302
321
 
303
322
  else
304
- if raw == :check
305
- data.close
306
- return nil
307
- end
323
+ return nil if raw == :check
308
324
  end
309
325
 
310
326
  return data.values if raw
311
327
 
312
328
  start_pos = data.identify_field "Start"
313
- segments = data.values_at(*keys).collect{|annotation|
329
+ data.read_and_close do
330
+ data.chunked_values_at(keys).collect{|annotation|
331
+ begin
314
332
  pos = annotation[start_pos]
315
- Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
316
- }.compact
317
- data.close
318
-
319
- segments
333
+ Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
334
+ rescue
335
+ Log.exception $!
336
+ iif keys
337
+ iif [text, annotation]
338
+ end
339
+
340
+ }.compact
341
+ end
320
342
  ensure
321
343
  data.close
322
344
  end
@@ -348,7 +370,7 @@ class Corpus
348
370
  segment.segments[name] = annotations
349
371
  class << segment
350
372
  self
351
- end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
373
+ end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
352
374
  end
353
375
 
354
376
  segment
@@ -3,7 +3,7 @@ require 'rbbt/fix_width_table'
3
3
 
4
4
  module Segment
5
5
  extend Annotation
6
- self.annotation :offset
6
+ self.annotation :offset, :docid
7
7
 
8
8
  def segment_length
9
9
  begin
@@ -325,7 +325,7 @@ module Segment
325
325
  tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
326
326
 
327
327
  segments.each do |segment|
328
- tsv[segment.object_id.to_s] = self.tsv_values_for_segment(segment, fields)
328
+ tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
329
329
  end
330
330
 
331
331
  tsv
@@ -348,6 +348,14 @@ module Segment
348
348
  [offset, self.end] * ".."
349
349
  end
350
350
 
351
+ def segment_id
352
+ if self.respond_to?(:docid)
353
+ [docid, locus, Misc.obj2digest(info)] * ":"
354
+ else
355
+ Misc.obj2digest(info)
356
+ end
357
+ end
358
+
351
359
  #def ==(other)
352
360
  # self.text == other.text
353
361
  #end
@@ -1,46 +1,46 @@
1
1
  require 'rbbt/text/segment'
2
2
 
3
- module SegmentWithDocid
4
- extend Annotation
5
-
6
- self.annotation :docid
7
-
8
- def masked?
9
- self[0..5] == "MASKED"
10
- end
11
-
12
- def mask
13
- return self if masked?
14
- raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
- raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
- raise "Cannot mask a segment with no offset" if offset.nil?
17
- textual_position = ["MASKED", length] * ":"
18
- self.replace(textual_position)
19
- self
20
- end
21
-
22
- def unmasked_text
23
- return self unless masked?
24
- tag, length = self.split(":")
25
- Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
26
- end
27
-
28
- def unmask
29
- return self unless masked?
30
- self.replace(unmasked_text)
31
- self
32
- end
33
-
34
- def str_length
35
- self.length
36
- end
37
-
38
- def masked_length
39
- self.split(":").last.to_i
40
- end
41
-
42
- def segment_length
43
- masked? ? masked_length : str_length
44
- end
45
- end
46
-
3
+ #module SegmentWithDocid
4
+ # extend Annotation
5
+ #
6
+ # self.annotation :docid
7
+ #
8
+ # def masked?
9
+ # self[0..5] == "MASKED"
10
+ # end
11
+ #
12
+ # def mask
13
+ # return self if masked?
14
+ # raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
+ # raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
+ # raise "Cannot mask a segment with no offset" if offset.nil?
17
+ # textual_position = ["MASKED", length] * ":"
18
+ # self.replace(textual_position)
19
+ # self
20
+ # end
21
+ #
22
+ # def unmasked_text
23
+ # return self unless masked?
24
+ # tag, length = self.split(":")
25
+ # Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
26
+ # end
27
+ #
28
+ # def unmask
29
+ # return self unless masked?
30
+ # self.replace(unmasked_text)
31
+ # self
32
+ # end
33
+ #
34
+ # def str_length
35
+ # self.length
36
+ # end
37
+ #
38
+ # def masked_length
39
+ # self.split(":").last.to_i
40
+ # end
41
+ #
42
+ # def segment_length
43
+ # masked? ? masked_length : str_length
44
+ # end
45
+ #end
46
+ #
@@ -29,6 +29,7 @@ Score: #{score.inspect}
29
29
  end
30
30
 
31
31
  def entity(params = nil)
32
+ code = self.dup
32
33
  format, entity = code.split(":")
33
34
  entity, format = format, nil if entity.nil?
34
35
 
@@ -111,10 +111,10 @@ module Transformed
111
111
 
112
112
  self[updated_begin..updated_end] = new
113
113
 
114
- @transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
114
+ @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
115
115
 
116
116
  segment.replace original_text
117
- stack << segment.object_id
117
+ stack << segment.segment_id
118
118
  end
119
119
  @transformation_stack << stack
120
120
  end
@@ -8,7 +8,6 @@ class TestGNormPlus < Test::Unit::TestCase
8
8
  We found that TP53 is regulated by MDM2 in Homo sapiens
9
9
  EOF
10
10
 
11
-
12
11
  mentions = GNormPlus.process({:file => text})
13
12
  Log.tsv mentions
14
13
  end
@@ -10,10 +10,6 @@ class TestCorpusDocument < Test::Unit::TestCase
10
10
  Segment.align(self.text, words)
11
11
  end
12
12
 
13
- Open.mkdir Rbbt.tmp.test.annotations.find
14
- Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
15
-
16
-
17
13
  Corpus::Document.define_multiple :words2 do |documents|
18
14
  documents.collect do |doc|
19
15
  words = doc.text.split(" ")
@@ -21,32 +17,66 @@ class TestCorpusDocument < Test::Unit::TestCase
21
17
  end
22
18
  end
23
19
 
20
+ Open.mkdir Rbbt.tmp.test.annotations.find
21
+
22
+ Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
24
23
  Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
25
24
  end
26
25
 
27
26
  def test_words
28
27
  text = "This is a test document"
29
- document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
28
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
30
29
  assert_equal Segment.sort(document.words), text.split(" ")
30
+ assert document.words.first.docid
31
+ assert document.words.first.segment_id.include?("TEST")
31
32
  end
32
33
 
33
34
  def test_words_multiple
34
35
  document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
35
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
36
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
37
+ document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
36
38
 
37
- docs = [document1, document2]
39
+ docs = [document1, document2, document3]
38
40
 
39
41
  Corpus::Document.prepare_multiple(docs, :words2)
40
-
42
+
43
+ assert document1.words.first.docid
44
+ assert document1.words.first.segment_id.include?("TEST")
45
+
41
46
  assert_equal document1.words2, document1.text.split(" ")
42
47
  assert_equal document2.words2, document2.text.split(" ")
48
+ assert_equal document3.words2, document3.text.split(" ")
43
49
 
44
50
  document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
45
- document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
51
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
46
52
 
47
53
  docs = [document1, document2]
48
54
 
49
55
  Corpus::Document.prepare_multiple(docs, :words2)
50
56
  end
57
+
58
+ def test_parallel
59
+ text =<<-EOF
60
+ This is a test document number
61
+ EOF
62
+
63
+ docs = []
64
+ 100.times do |i|
65
+ docs << text.chomp + " " + i.to_s
66
+ end
67
+
68
+ Log.with_severity 0 do
69
+ TSV.traverse docs, :cpus => 10, :bar => true do |doc|
70
+ hash = Misc.digest(doc)
71
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
72
+ assert_equal Segment.sort(document.words), document.text.split(" ")
73
+ end
74
+ TSV.traverse docs, :cpus => 10, :bar => true do |doc|
75
+ hash = Misc.digest(doc)
76
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
77
+ assert_equal Segment.sort(document.words), document.text.split(" ")
78
+ end
79
+ end
80
+ end
51
81
  end
52
82
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-13 00:00:00.000000000 Z
11
+ date: 2020-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util