simple_bioc 0.0.20 → 0.0.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/simple_bioc/bioc_reader.rb +0 -2
- data/lib/simple_bioc/passage.rb +0 -3
- data/lib/simple_bioc/sentence.rb +0 -3
- data/lib/simple_bioc/version.rb +1 -1
- data/lib/simple_bioc.rb +0 -4
- data/spec/file_check_spec.rb +0 -52
- metadata +2 -4
- data/lib/simple_bioc/bioc_merger.rb +0 -229
- data/lib/simple_bioc/location_adjuster.rb +0 -45
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 344d68b7d1e117a05da96939c1ace1ff10cb630f
|
4
|
+
data.tar.gz: 549cfc037ec425cfa53815238aab5628044346c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e19302d994c531a1fe9ac2407a48c4517fbef8a6f0671bbc82a2e06e381613d0d48205b705ca372ddd9d8f63282b1845ac5775aec7bf603ed9ddc87fd3a599e9
|
7
|
+
data.tar.gz: fa6a72ce6773ee3ac3726be469e3bfa0b3fc808827f4da62926fe7fca7da67eb962fbf3ee4863a456ffc063e32dfcb77c9489efa0d4d5c77e47110bee34224f0
|
data/lib/simple_bioc/bioc_reader.rb
CHANGED
@@ -84,7 +84,6 @@ module BioCReader
|
|
84
84
|
read_recursive(xml, passage, "sentence")
|
85
85
|
read_recursive(xml, passage, "annotation")
|
86
86
|
read_recursive(xml, passage, "relation")
|
87
|
-
passage.adjust_annotation_offsets
|
88
87
|
true
|
89
88
|
end
|
90
89
|
|
@@ -94,7 +93,6 @@ module BioCReader
|
|
94
93
|
read_infon(xml, sentence)
|
95
94
|
read_recursive(xml, sentence, "annotation")
|
96
95
|
read_recursive(xml, sentence, "relation")
|
97
|
-
sentence.adjust_annotation_offsets
|
98
96
|
true
|
99
97
|
end
|
100
98
|
|
data/lib/simple_bioc/passage.rb
CHANGED
data/lib/simple_bioc/sentence.rb
CHANGED
data/lib/simple_bioc/version.rb
CHANGED
data/lib/simple_bioc.rb
CHANGED
@@ -42,10 +42,6 @@ module SimpleBioC
|
|
42
42
|
BioCReader.read_from_file_or_string(file, options)
|
43
43
|
end
|
44
44
|
|
45
|
-
def merge(dest_bioc, src_bioc)
|
46
|
-
return BioCMerger.merge(dest_bioc, src_bioc)
|
47
|
-
end
|
48
|
-
|
49
45
|
# parse a BioC XML string and convert it into a collection instance
|
50
46
|
#
|
51
47
|
# ==== Arguments
|
data/spec/file_check_spec.rb
CHANGED
@@ -11,62 +11,10 @@ describe "File Check" do
|
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
|
-
it "should merge documents successfully" do
|
15
|
-
col1 = SimpleBioC.from_xml("./xml/merge/9864355.xml")
|
16
|
-
col2 = SimpleBioC.from_xml("./xml/merge/9864355_1.xml")
|
17
|
-
col3 = SimpleBioC.from_xml("./xml/merge/9864355_2.xml")
|
18
|
-
col4 = SimpleBioC.from_xml("./xml/merge/9864355_3.xml")
|
19
|
-
|
20
|
-
SimpleBioC.merge(col1, col2)
|
21
|
-
SimpleBioC.merge(col1, col3)
|
22
|
-
SimpleBioC.merge(col1, col4)
|
23
|
-
output = SimpleBioC.to_xml(col1)
|
24
|
-
File.write("./xml/merge/output.xml", output)
|
25
|
-
puts "merge1"
|
26
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output.xml")
|
27
|
-
end
|
28
|
-
|
29
|
-
it "should merge same documents successfully" do
|
30
|
-
col1 = SimpleBioC.from_xml("./xml/10330397_gene.xml")
|
31
|
-
col2 = SimpleBioC.from_xml("./xml/10330397_ppimention.xml")
|
32
|
-
|
33
|
-
SimpleBioC.merge(col1, col2)
|
34
|
-
output = SimpleBioC.to_xml(col1)
|
35
|
-
File.write("./xml/merge/output_10330397.xml", output)
|
36
|
-
puts "merge2"
|
37
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output_10330397.xml")
|
38
|
-
end
|
39
|
-
|
40
14
|
it "should fix location problem" do
|
41
15
|
col1 = SimpleBioC.from_xml("./xml/merge/10366597_error.xml")
|
42
16
|
output = SimpleBioC.to_xml(col1)
|
43
17
|
File.write("./xml/merge/output_10366597.xml", output)
|
44
18
|
col5 = SimpleBioC.from_xml("./xml/merge/output_10366597.xml")
|
45
19
|
end
|
46
|
-
|
47
|
-
|
48
|
-
it "should merge documents successfully with different order" do
|
49
|
-
col4 = SimpleBioC.from_xml("./xml/merge/9864355.xml")
|
50
|
-
col3 = SimpleBioC.from_xml("./xml/merge/9864355_1.xml")
|
51
|
-
col1 = SimpleBioC.from_xml("./xml/merge/9864355_2.xml")
|
52
|
-
col2 = SimpleBioC.from_xml("./xml/merge/9864355_3.xml")
|
53
|
-
|
54
|
-
SimpleBioC.merge(col1, col2)
|
55
|
-
SimpleBioC.merge(col1, col3)
|
56
|
-
SimpleBioC.merge(col1, col4)
|
57
|
-
output = SimpleBioC.to_xml(col1)
|
58
|
-
File.write("./xml/merge/output.xml", output)
|
59
|
-
puts "merge12"
|
60
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output.xml")
|
61
|
-
end
|
62
|
-
|
63
|
-
it "should load XML files successfully" do
|
64
|
-
Dir["./xml/*.xml"].each do |file_path|
|
65
|
-
puts "self-merge #{file_path}"
|
66
|
-
collection1 = SimpleBioC.from_xml(file_path)
|
67
|
-
collection2 = SimpleBioC.from_xml(file_path)
|
68
|
-
SimpleBioC.merge(collection1, collection2)
|
69
|
-
output = SimpleBioC.to_xml(collection1)
|
70
|
-
end
|
71
|
-
end
|
72
20
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_bioc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.20
|
4
|
+
version: 0.0.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dongseop Kwon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -185,13 +185,11 @@ files:
|
|
185
185
|
- html/table_of_contents.html
|
186
186
|
- lib/simple_bioc.rb
|
187
187
|
- lib/simple_bioc/annotation.rb
|
188
|
-
- lib/simple_bioc/bioc_merger.rb
|
189
188
|
- lib/simple_bioc/bioc_reader.rb
|
190
189
|
- lib/simple_bioc/bioc_writer.rb
|
191
190
|
- lib/simple_bioc/collection.rb
|
192
191
|
- lib/simple_bioc/document.rb
|
193
192
|
- lib/simple_bioc/location.rb
|
194
|
-
- lib/simple_bioc/location_adjuster.rb
|
195
193
|
- lib/simple_bioc/node.rb
|
196
194
|
- lib/simple_bioc/node_base.rb
|
197
195
|
- lib/simple_bioc/passage.rb
|
data/lib/simple_bioc/bioc_merger.rb
DELETED
@@ -1,229 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
|
-
Dir[File.dirname(__FILE__) + '/*.rb'].each {|file| require file }
|
4
|
-
|
5
|
-
module BioCMerger
|
6
|
-
module_function
|
7
|
-
|
8
|
-
def merge(dest_collection, src_collection)
|
9
|
-
errors = []
|
10
|
-
warnings = []
|
11
|
-
id_map = {}
|
12
|
-
|
13
|
-
if dest_collection.documents.size != 1 || src_collection.documents.size != 1
|
14
|
-
warnings << 'Only the first documents will be merged'
|
15
|
-
end
|
16
|
-
|
17
|
-
doc_d = dest_collection.documents[0]
|
18
|
-
doc_s = src_collection.documents[0]
|
19
|
-
|
20
|
-
copy_infons(dest_collection, src_collection)
|
21
|
-
dest_collection.source = src_collection.source if dest_collection.source.nil? || dest_collection.source.empty?
|
22
|
-
dest_collection.date = src_collection.date if dest_collection.date.nil? || dest_collection.date.empty?
|
23
|
-
dest_collection.key = src_collection.key if dest_collection.key.nil? || dest_collection.key.empty?
|
24
|
-
|
25
|
-
copy_infons(doc_d, doc_s)
|
26
|
-
copy_relations(doc_d, doc_d, doc_s, id_map)
|
27
|
-
|
28
|
-
if doc_d.passages.size != doc_s.passages.size
|
29
|
-
warnings << 'Passages will not be merged because the numbers of passages in documents are different'
|
30
|
-
end
|
31
|
-
|
32
|
-
doc_d.passages.each_with_index do |p_d, index|
|
33
|
-
p_s = doc_s.passages[index]
|
34
|
-
if p_d.nil? || p_s.nil?
|
35
|
-
warnings << 'The number of sentences in pages should be the same'
|
36
|
-
elsif blank?(p_d.text) && blank?(p_s.text) && p_d.sentences.size != p_s.sentences.size
|
37
|
-
warnings << 'The number of sentences in pages should be the same'
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
doc_d.passages.each_with_index do |p_d, index|
|
42
|
-
p_s = doc_s.passages[index]
|
43
|
-
next if p_d.nil? || p_s.nil?
|
44
|
-
copy_relations(doc_d, p_d, p_s, id_map)
|
45
|
-
if p_d.sentences.size == p_s.sentences.size
|
46
|
-
p_d.sentences.each_with_index do |s_d, index|
|
47
|
-
s_s = p_s.sentences[index]
|
48
|
-
copy_infons(s_d, s_s)
|
49
|
-
copy_text(s_d, s_s)
|
50
|
-
copy_relations(doc_d, s_d, s_s, id_map)
|
51
|
-
copy_annotations(doc_d, s_d, s_s, id_map)
|
52
|
-
s_d.adjust_annotation_offsets
|
53
|
-
end
|
54
|
-
elsif p_d.sentences.size == 0
|
55
|
-
p_d.text = p_s.sentences.map{|s| s.text}.join(" ") if blank?(p_d.text)
|
56
|
-
p_s.sentences.each do |s|
|
57
|
-
copy_relations(doc_d, p_d, s, id_map)
|
58
|
-
copy_annotations(doc_d, p_d, s, id_map)
|
59
|
-
end
|
60
|
-
elsif p_s.sentences.size == 0
|
61
|
-
if p_d.sentences.size > 0
|
62
|
-
# dest has sentences, but src has only passages.
|
63
|
-
p_d.text = p_d.sentences.map{|s| s.text}.join(" ") if blank?(p_d.text)
|
64
|
-
p_d.sentences.each do |s|
|
65
|
-
s.annotations.each do |a|
|
66
|
-
a.clear_sentence
|
67
|
-
p_d.annotations << a
|
68
|
-
end
|
69
|
-
s.relations.each do |r|
|
70
|
-
r.clear_sentence
|
71
|
-
p_d.relations << r
|
72
|
-
end
|
73
|
-
end
|
74
|
-
p_d.sentences.clear
|
75
|
-
else
|
76
|
-
copy_text(p_d, p_s)
|
77
|
-
end
|
78
|
-
end
|
79
|
-
copy_annotations(doc_d, p_d, p_s, id_map)
|
80
|
-
p_d.adjust_annotation_offsets
|
81
|
-
end
|
82
|
-
puts warnings
|
83
|
-
end
|
84
|
-
|
85
|
-
def adjust_relation_refids(doc, id_map)
|
86
|
-
adjust_relation_refid(doc, id_map)
|
87
|
-
doc.passages.each do |p|
|
88
|
-
adjust_relation_refid(p, id_map)
|
89
|
-
p.sentences.each do |s|
|
90
|
-
adjust_relation_refid(s, id_map)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
def adjust_relation_refid(obj, id_map)
|
96
|
-
return if obj.nil?
|
97
|
-
obj.relations.each do |r|
|
98
|
-
next if r.original.nil?
|
99
|
-
r.nodes.each do |n|
|
100
|
-
new_id = id_map[n.refid]
|
101
|
-
n.refid = new_id unless new_id.nil?
|
102
|
-
n.adjust_ref
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def copy_relations(doc, dest, src, id_map)
|
108
|
-
return if src.nil?
|
109
|
-
src.relations.each do |r|
|
110
|
-
copy_relation(doc, dest, r, id_map)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
def copy_annotations(doc, dest, src, id_map)
|
115
|
-
return if src.nil?
|
116
|
-
src.annotations.each do |a|
|
117
|
-
copy_annotation(doc, dest, a, id_map)
|
118
|
-
end
|
119
|
-
end
|
120
|
-
def copy_relation(doc, dest, relation, id_map)
|
121
|
-
new_r = nil
|
122
|
-
need_add = true
|
123
|
-
dest.relations.each do |r|
|
124
|
-
if r.id == relation.id
|
125
|
-
new_r = r
|
126
|
-
need_add = false
|
127
|
-
break
|
128
|
-
end
|
129
|
-
end
|
130
|
-
if new_r.nil?
|
131
|
-
new_r = SimpleBioC::Relation.new(dest)
|
132
|
-
new_r.id = choose_id(doc, relation.id, id_map)
|
133
|
-
new_r.original = relation
|
134
|
-
end
|
135
|
-
|
136
|
-
relation.nodes.each do |n|
|
137
|
-
found = false
|
138
|
-
new_r.nodes.each do |old_n|
|
139
|
-
if n.refid == old_n.refid && n.role == old_n.role
|
140
|
-
found = true
|
141
|
-
break
|
142
|
-
end
|
143
|
-
end
|
144
|
-
unless found
|
145
|
-
node = SimpleBioC::Node.new(new_r)
|
146
|
-
node.refid = n.refid
|
147
|
-
node.role = n.role
|
148
|
-
new_r.nodes << node
|
149
|
-
end
|
150
|
-
end
|
151
|
-
copy_infons(new_r, relation)
|
152
|
-
if need_add
|
153
|
-
dest.relations << new_r
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def copy_annotation(doc, dest, annotation, id_map)
|
158
|
-
new_a = nil
|
159
|
-
need_add = true
|
160
|
-
dest.annotations.each do |a|
|
161
|
-
if a.id == annotation.id && a.text == annotation.text
|
162
|
-
new_a = a
|
163
|
-
need_add = false
|
164
|
-
break
|
165
|
-
end
|
166
|
-
end
|
167
|
-
if new_a.nil?
|
168
|
-
new_a = SimpleBioC::Annotation.new(dest)
|
169
|
-
new_a.id = choose_id(doc, annotation.id, id_map)
|
170
|
-
new_a.text = annotation.text
|
171
|
-
new_a.locations = []
|
172
|
-
end
|
173
|
-
|
174
|
-
annotation.locations.each do |l|
|
175
|
-
found = false
|
176
|
-
new_a.locations.each do |old_l|
|
177
|
-
if l.offset == old_l.offset && l.length == old_l.length
|
178
|
-
found = true
|
179
|
-
break
|
180
|
-
end
|
181
|
-
end
|
182
|
-
unless found
|
183
|
-
new_l = SimpleBioC::Location.new(new_a)
|
184
|
-
new_l.offset = l.offset
|
185
|
-
new_l.length = l.length
|
186
|
-
new_a.locations << new_l
|
187
|
-
end
|
188
|
-
end
|
189
|
-
copy_infons(new_a, annotation)
|
190
|
-
if need_add
|
191
|
-
dest.annotations << new_a
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
def choose_id(doc, id, id_map)
|
196
|
-
new_id = id || "id"
|
197
|
-
node = doc.find_node(new_id)
|
198
|
-
|
199
|
-
until node.nil? do
|
200
|
-
new_id = new_id + "_c"
|
201
|
-
node = doc.find_node(new_id)
|
202
|
-
end
|
203
|
-
|
204
|
-
if new_id != id
|
205
|
-
id_map[id] = new_id
|
206
|
-
end
|
207
|
-
return new_id
|
208
|
-
end
|
209
|
-
|
210
|
-
def copy_text(dest, src)
|
211
|
-
if blank?(dest.text) && !blank?(src.text)
|
212
|
-
dest.text = src.text
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
def blank?(text)
|
217
|
-
return text.nil? || text.empty?
|
218
|
-
end
|
219
|
-
|
220
|
-
def copy_infons(dest, src)
|
221
|
-
src.infons.each do |k, v|
|
222
|
-
if dest.infons[k].nil?
|
223
|
-
dest.infons[k] = v
|
224
|
-
elsif dest.infons[k] != v
|
225
|
-
|
226
|
-
end
|
227
|
-
end
|
228
|
-
end
|
229
|
-
end
|
data/lib/simple_bioc/location_adjuster.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
module SimpleBioC
|
2
|
-
module LocationAdjuster
|
3
|
-
def adjust_annotation_offsets
|
4
|
-
obj = self
|
5
|
-
return if obj.nil? || obj.annotations.nil?
|
6
|
-
obj.annotations.each do |a|
|
7
|
-
positions = find_all_locations(obj, a.text)
|
8
|
-
next a.locations.nil?
|
9
|
-
a.locations.each do |l|
|
10
|
-
next if l.nil? || l == false
|
11
|
-
# l.original_offset = l.offset.to_i if l.original_offset.nil?
|
12
|
-
l.offset = choose_offset_candidate(l.offset, positions)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
module_function
|
18
|
-
|
19
|
-
def find_all_locations(obj, text)
|
20
|
-
positions = []
|
21
|
-
return positions if obj.nil? || obj.text.nil?
|
22
|
-
pos = obj.text.index(text)
|
23
|
-
until pos.nil?
|
24
|
-
positions << (pos + obj.offset)
|
25
|
-
pos = obj.text.index(text, pos + 1)
|
26
|
-
end
|
27
|
-
return positions
|
28
|
-
end
|
29
|
-
|
30
|
-
def choose_offset_candidate(offset, positions)
|
31
|
-
return offset if positions.nil?
|
32
|
-
min_diff = 99999
|
33
|
-
offset = offset.to_i
|
34
|
-
ret = offset
|
35
|
-
positions.each do |p|
|
36
|
-
diff = (offset - p).abs
|
37
|
-
if diff < min_diff
|
38
|
-
ret = p
|
39
|
-
min_diff = diff
|
40
|
-
end
|
41
|
-
end
|
42
|
-
return ret
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|