simple_bioc 0.0.20 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/simple_bioc/bioc_reader.rb +0 -2
- data/lib/simple_bioc/passage.rb +0 -3
- data/lib/simple_bioc/sentence.rb +0 -3
- data/lib/simple_bioc/version.rb +1 -1
- data/lib/simple_bioc.rb +0 -4
- data/spec/file_check_spec.rb +0 -52
- metadata +2 -4
- data/lib/simple_bioc/bioc_merger.rb +0 -229
- data/lib/simple_bioc/location_adjuster.rb +0 -45
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 344d68b7d1e117a05da96939c1ace1ff10cb630f
|
4
|
+
data.tar.gz: 549cfc037ec425cfa53815238aab5628044346c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e19302d994c531a1fe9ac2407a48c4517fbef8a6f0671bbc82a2e06e381613d0d48205b705ca372ddd9d8f63282b1845ac5775aec7bf603ed9ddc87fd3a599e9
|
7
|
+
data.tar.gz: fa6a72ce6773ee3ac3726be469e3bfa0b3fc808827f4da62926fe7fca7da67eb962fbf3ee4863a456ffc063e32dfcb77c9489efa0d4d5c77e47110bee34224f0
|
@@ -84,7 +84,6 @@ module BioCReader
|
|
84
84
|
read_recursive(xml, passage, "sentence")
|
85
85
|
read_recursive(xml, passage, "annotation")
|
86
86
|
read_recursive(xml, passage, "relation")
|
87
|
-
passage.adjust_annotation_offsets
|
88
87
|
true
|
89
88
|
end
|
90
89
|
|
@@ -94,7 +93,6 @@ module BioCReader
|
|
94
93
|
read_infon(xml, sentence)
|
95
94
|
read_recursive(xml, sentence, "annotation")
|
96
95
|
read_recursive(xml, sentence, "relation")
|
97
|
-
sentence.adjust_annotation_offsets
|
98
96
|
true
|
99
97
|
end
|
100
98
|
|
data/lib/simple_bioc/passage.rb
CHANGED
data/lib/simple_bioc/sentence.rb
CHANGED
data/lib/simple_bioc/version.rb
CHANGED
data/lib/simple_bioc.rb
CHANGED
@@ -42,10 +42,6 @@ module SimpleBioC
|
|
42
42
|
BioCReader.read_from_file_or_string(file, options)
|
43
43
|
end
|
44
44
|
|
45
|
-
def merge(dest_bioc, src_bioc)
|
46
|
-
return BioCMerger.merge(dest_bioc, src_bioc)
|
47
|
-
end
|
48
|
-
|
49
45
|
# parse a BioC XML string and convert it into a collection instance
|
50
46
|
#
|
51
47
|
# ==== Arguments
|
data/spec/file_check_spec.rb
CHANGED
@@ -11,62 +11,10 @@ describe "File Check" do
|
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
|
-
it "should merge documents successfully" do
|
15
|
-
col1 = SimpleBioC.from_xml("./xml/merge/9864355.xml")
|
16
|
-
col2 = SimpleBioC.from_xml("./xml/merge/9864355_1.xml")
|
17
|
-
col3 = SimpleBioC.from_xml("./xml/merge/9864355_2.xml")
|
18
|
-
col4 = SimpleBioC.from_xml("./xml/merge/9864355_3.xml")
|
19
|
-
|
20
|
-
SimpleBioC.merge(col1, col2)
|
21
|
-
SimpleBioC.merge(col1, col3)
|
22
|
-
SimpleBioC.merge(col1, col4)
|
23
|
-
output = SimpleBioC.to_xml(col1)
|
24
|
-
File.write("./xml/merge/output.xml", output)
|
25
|
-
puts "merge1"
|
26
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output.xml")
|
27
|
-
end
|
28
|
-
|
29
|
-
it "should merge same documents successfully" do
|
30
|
-
col1 = SimpleBioC.from_xml("./xml/10330397_gene.xml")
|
31
|
-
col2 = SimpleBioC.from_xml("./xml/10330397_ppimention.xml")
|
32
|
-
|
33
|
-
SimpleBioC.merge(col1, col2)
|
34
|
-
output = SimpleBioC.to_xml(col1)
|
35
|
-
File.write("./xml/merge/output_10330397.xml", output)
|
36
|
-
puts "merge2"
|
37
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output_10330397.xml")
|
38
|
-
end
|
39
|
-
|
40
14
|
it "should fix location problem" do
|
41
15
|
col1 = SimpleBioC.from_xml("./xml/merge/10366597_error.xml")
|
42
16
|
output = SimpleBioC.to_xml(col1)
|
43
17
|
File.write("./xml/merge/output_10366597.xml", output)
|
44
18
|
col5 = SimpleBioC.from_xml("./xml/merge/output_10366597.xml")
|
45
19
|
end
|
46
|
-
|
47
|
-
|
48
|
-
it "should merge documents successfully with different order" do
|
49
|
-
col4 = SimpleBioC.from_xml("./xml/merge/9864355.xml")
|
50
|
-
col3 = SimpleBioC.from_xml("./xml/merge/9864355_1.xml")
|
51
|
-
col1 = SimpleBioC.from_xml("./xml/merge/9864355_2.xml")
|
52
|
-
col2 = SimpleBioC.from_xml("./xml/merge/9864355_3.xml")
|
53
|
-
|
54
|
-
SimpleBioC.merge(col1, col2)
|
55
|
-
SimpleBioC.merge(col1, col3)
|
56
|
-
SimpleBioC.merge(col1, col4)
|
57
|
-
output = SimpleBioC.to_xml(col1)
|
58
|
-
File.write("./xml/merge/output.xml", output)
|
59
|
-
puts "merge12"
|
60
|
-
col5 = SimpleBioC.from_xml("./xml/merge/output.xml")
|
61
|
-
end
|
62
|
-
|
63
|
-
it "should load XML files successfully" do
|
64
|
-
Dir["./xml/*.xml"].each do |file_path|
|
65
|
-
puts "self-merge #{file_path}"
|
66
|
-
collection1 = SimpleBioC.from_xml(file_path)
|
67
|
-
collection2 = SimpleBioC.from_xml(file_path)
|
68
|
-
SimpleBioC.merge(collection1, collection2)
|
69
|
-
output = SimpleBioC.to_xml(collection1)
|
70
|
-
end
|
71
|
-
end
|
72
20
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: simple_bioc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.20
|
4
|
+
version: 0.0.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dongseop Kwon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -185,13 +185,11 @@ files:
|
|
185
185
|
- html/table_of_contents.html
|
186
186
|
- lib/simple_bioc.rb
|
187
187
|
- lib/simple_bioc/annotation.rb
|
188
|
-
- lib/simple_bioc/bioc_merger.rb
|
189
188
|
- lib/simple_bioc/bioc_reader.rb
|
190
189
|
- lib/simple_bioc/bioc_writer.rb
|
191
190
|
- lib/simple_bioc/collection.rb
|
192
191
|
- lib/simple_bioc/document.rb
|
193
192
|
- lib/simple_bioc/location.rb
|
194
|
-
- lib/simple_bioc/location_adjuster.rb
|
195
193
|
- lib/simple_bioc/node.rb
|
196
194
|
- lib/simple_bioc/node_base.rb
|
197
195
|
- lib/simple_bioc/passage.rb
|
@@ -1,229 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
|
-
Dir[File.dirname(__FILE__) + '/*.rb'].each {|file| require file }
|
4
|
-
|
5
|
-
module BioCMerger
|
6
|
-
module_function
|
7
|
-
|
8
|
-
def merge(dest_collection, src_collection)
|
9
|
-
errors = []
|
10
|
-
warnings = []
|
11
|
-
id_map = {}
|
12
|
-
|
13
|
-
if dest_collection.documents.size != 1 || src_collection.documents.size != 1
|
14
|
-
warnings << 'Only the first documents will be merged'
|
15
|
-
end
|
16
|
-
|
17
|
-
doc_d = dest_collection.documents[0]
|
18
|
-
doc_s = src_collection.documents[0]
|
19
|
-
|
20
|
-
copy_infons(dest_collection, src_collection)
|
21
|
-
dest_collection.source = src_collection.source if dest_collection.source.nil? || dest_collection.source.empty?
|
22
|
-
dest_collection.date = src_collection.date if dest_collection.date.nil? || dest_collection.date.empty?
|
23
|
-
dest_collection.key = src_collection.key if dest_collection.key.nil? || dest_collection.key.empty?
|
24
|
-
|
25
|
-
copy_infons(doc_d, doc_s)
|
26
|
-
copy_relations(doc_d, doc_d, doc_s, id_map)
|
27
|
-
|
28
|
-
if doc_d.passages.size != doc_s.passages.size
|
29
|
-
warnings << 'Passages will not be merged because the numbers of passages in documents are different'
|
30
|
-
end
|
31
|
-
|
32
|
-
doc_d.passages.each_with_index do |p_d, index|
|
33
|
-
p_s = doc_s.passages[index]
|
34
|
-
if p_d.nil? || p_s.nil?
|
35
|
-
warnings << 'The number of sentences in pages should be the same'
|
36
|
-
elsif blank?(p_d.text) && blank?(p_s.text) && p_d.sentences.size != p_s.sentences.size
|
37
|
-
warnings << 'The number of sentences in pages should be the same'
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
doc_d.passages.each_with_index do |p_d, index|
|
42
|
-
p_s = doc_s.passages[index]
|
43
|
-
next if p_d.nil? || p_s.nil?
|
44
|
-
copy_relations(doc_d, p_d, p_s, id_map)
|
45
|
-
if p_d.sentences.size == p_s.sentences.size
|
46
|
-
p_d.sentences.each_with_index do |s_d, index|
|
47
|
-
s_s = p_s.sentences[index]
|
48
|
-
copy_infons(s_d, s_s)
|
49
|
-
copy_text(s_d, s_s)
|
50
|
-
copy_relations(doc_d, s_d, s_s, id_map)
|
51
|
-
copy_annotations(doc_d, s_d, s_s, id_map)
|
52
|
-
s_d.adjust_annotation_offsets
|
53
|
-
end
|
54
|
-
elsif p_d.sentences.size == 0
|
55
|
-
p_d.text = p_s.sentences.map{|s| s.text}.join(" ") if blank?(p_d.text)
|
56
|
-
p_s.sentences.each do |s|
|
57
|
-
copy_relations(doc_d, p_d, s, id_map)
|
58
|
-
copy_annotations(doc_d, p_d, s, id_map)
|
59
|
-
end
|
60
|
-
elsif p_s.sentences.size == 0
|
61
|
-
if p_d.sentences.size > 0
|
62
|
-
# dest has sentences, but src has only passages.
|
63
|
-
p_d.text = p_d.sentences.map{|s| s.text}.join(" ") if blank?(p_d.text)
|
64
|
-
p_d.sentences.each do |s|
|
65
|
-
s.annotations.each do |a|
|
66
|
-
a.clear_sentence
|
67
|
-
p_d.annotations << a
|
68
|
-
end
|
69
|
-
s.relations.each do |r|
|
70
|
-
r.clear_sentence
|
71
|
-
p_d.relations << r
|
72
|
-
end
|
73
|
-
end
|
74
|
-
p_d.sentences.clear
|
75
|
-
else
|
76
|
-
copy_text(p_d, p_s)
|
77
|
-
end
|
78
|
-
end
|
79
|
-
copy_annotations(doc_d, p_d, p_s, id_map)
|
80
|
-
p_d.adjust_annotation_offsets
|
81
|
-
end
|
82
|
-
puts warnings
|
83
|
-
end
|
84
|
-
|
85
|
-
def adjust_relation_refids(doc, id_map)
|
86
|
-
adjust_relation_refid(doc, id_map)
|
87
|
-
doc.passages.each do |p|
|
88
|
-
adjust_relation_refid(p, id_map)
|
89
|
-
p.sentences.each do |s|
|
90
|
-
adjust_relation_refid(s, id_map)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
def adjust_relation_refid(obj, id_map)
|
96
|
-
return if obj.nil?
|
97
|
-
obj.relations.each do |r|
|
98
|
-
next if r.original.nil?
|
99
|
-
r.nodes.each do |n|
|
100
|
-
new_id = id_map[n.refid]
|
101
|
-
n.refid = new_id unless new_id.nil?
|
102
|
-
n.adjust_ref
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def copy_relations(doc, dest, src, id_map)
|
108
|
-
return if src.nil?
|
109
|
-
src.relations.each do |r|
|
110
|
-
copy_relation(doc, dest, r, id_map)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
def copy_annotations(doc, dest, src, id_map)
|
115
|
-
return if src.nil?
|
116
|
-
src.annotations.each do |a|
|
117
|
-
copy_annotation(doc, dest, a, id_map)
|
118
|
-
end
|
119
|
-
end
|
120
|
-
def copy_relation(doc, dest, relation, id_map)
|
121
|
-
new_r = nil
|
122
|
-
need_add = true
|
123
|
-
dest.relations.each do |r|
|
124
|
-
if r.id == relation.id
|
125
|
-
new_r = r
|
126
|
-
need_add = false
|
127
|
-
break
|
128
|
-
end
|
129
|
-
end
|
130
|
-
if new_r.nil?
|
131
|
-
new_r = SimpleBioC::Relation.new(dest)
|
132
|
-
new_r.id = choose_id(doc, relation.id, id_map)
|
133
|
-
new_r.original = relation
|
134
|
-
end
|
135
|
-
|
136
|
-
relation.nodes.each do |n|
|
137
|
-
found = false
|
138
|
-
new_r.nodes.each do |old_n|
|
139
|
-
if n.refid == old_n.refid && n.role == old_n.role
|
140
|
-
found = true
|
141
|
-
break
|
142
|
-
end
|
143
|
-
end
|
144
|
-
unless found
|
145
|
-
node = SimpleBioC::Node.new(new_r)
|
146
|
-
node.refid = n.refid
|
147
|
-
node.role = n.role
|
148
|
-
new_r.nodes << node
|
149
|
-
end
|
150
|
-
end
|
151
|
-
copy_infons(new_r, relation)
|
152
|
-
if need_add
|
153
|
-
dest.relations << new_r
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def copy_annotation(doc, dest, annotation, id_map)
|
158
|
-
new_a = nil
|
159
|
-
need_add = true
|
160
|
-
dest.annotations.each do |a|
|
161
|
-
if a.id == annotation.id && a.text == annotation.text
|
162
|
-
new_a = a
|
163
|
-
need_add = false
|
164
|
-
break
|
165
|
-
end
|
166
|
-
end
|
167
|
-
if new_a.nil?
|
168
|
-
new_a = SimpleBioC::Annotation.new(dest)
|
169
|
-
new_a.id = choose_id(doc, annotation.id, id_map)
|
170
|
-
new_a.text = annotation.text
|
171
|
-
new_a.locations = []
|
172
|
-
end
|
173
|
-
|
174
|
-
annotation.locations.each do |l|
|
175
|
-
found = false
|
176
|
-
new_a.locations.each do |old_l|
|
177
|
-
if l.offset == old_l.offset && l.length == old_l.length
|
178
|
-
found = true
|
179
|
-
break
|
180
|
-
end
|
181
|
-
end
|
182
|
-
unless found
|
183
|
-
new_l = SimpleBioC::Location.new(new_a)
|
184
|
-
new_l.offset = l.offset
|
185
|
-
new_l.length = l.length
|
186
|
-
new_a.locations << new_l
|
187
|
-
end
|
188
|
-
end
|
189
|
-
copy_infons(new_a, annotation)
|
190
|
-
if need_add
|
191
|
-
dest.annotations << new_a
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
def choose_id(doc, id, id_map)
|
196
|
-
new_id = id || "id"
|
197
|
-
node = doc.find_node(new_id)
|
198
|
-
|
199
|
-
until node.nil? do
|
200
|
-
new_id = new_id + "_c"
|
201
|
-
node = doc.find_node(new_id)
|
202
|
-
end
|
203
|
-
|
204
|
-
if new_id != id
|
205
|
-
id_map[id] = new_id
|
206
|
-
end
|
207
|
-
return new_id
|
208
|
-
end
|
209
|
-
|
210
|
-
def copy_text(dest, src)
|
211
|
-
if blank?(dest.text) && !blank?(src.text)
|
212
|
-
dest.text = src.text
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
def blank?(text)
|
217
|
-
return text.nil? || text.empty?
|
218
|
-
end
|
219
|
-
|
220
|
-
def copy_infons(dest, src)
|
221
|
-
src.infons.each do |k, v|
|
222
|
-
if dest.infons[k].nil?
|
223
|
-
dest.infons[k] = v
|
224
|
-
elsif dest.infons[k] != v
|
225
|
-
|
226
|
-
end
|
227
|
-
end
|
228
|
-
end
|
229
|
-
end
|
@@ -1,45 +0,0 @@
|
|
1
|
-
module SimpleBioC
|
2
|
-
module LocationAdjuster
|
3
|
-
def adjust_annotation_offsets
|
4
|
-
obj = self
|
5
|
-
return if obj.nil? || obj.annotations.nil?
|
6
|
-
obj.annotations.each do |a|
|
7
|
-
positions = find_all_locations(obj, a.text)
|
8
|
-
next a.locations.nil?
|
9
|
-
a.locations.each do |l|
|
10
|
-
next if l.nil? || l == false
|
11
|
-
# l.original_offset = l.offset.to_i if l.original_offset.nil?
|
12
|
-
l.offset = choose_offset_candidate(l.offset, positions)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
module_function
|
18
|
-
|
19
|
-
def find_all_locations(obj, text)
|
20
|
-
positions = []
|
21
|
-
return positions if obj.nil? || obj.text.nil?
|
22
|
-
pos = obj.text.index(text)
|
23
|
-
until pos.nil?
|
24
|
-
positions << (pos + obj.offset)
|
25
|
-
pos = obj.text.index(text, pos + 1)
|
26
|
-
end
|
27
|
-
return positions
|
28
|
-
end
|
29
|
-
|
30
|
-
def choose_offset_candidate(offset, positions)
|
31
|
-
return offset if positions.nil?
|
32
|
-
min_diff = 99999
|
33
|
-
offset = offset.to_i
|
34
|
-
ret = offset
|
35
|
-
positions.each do |p|
|
36
|
-
diff = (offset - p).abs
|
37
|
-
if diff < min_diff
|
38
|
-
ret = p
|
39
|
-
min_diff = diff
|
40
|
-
end
|
41
|
-
end
|
42
|
-
return ret
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|