obo_parser 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +12 -3
- data/VERSION +1 -1
- data/lib/tokens.rb +3 -2
- data/lib/utilities.rb +193 -53
- data/obo_parser.gemspec +26 -28
- data/test/cell.obo +1 -1
- data/test/go.obo +18523 -0
- data/test/hao.obo +14175 -0
- data/test/test_obo_parser.rb +38 -4
- data/test/tgma.obo +18522 -0
- metadata +12 -12
- data/.gitignore +0 -21
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= obo_parser
|
2
2
|
|
3
|
-
A simple Ruby gem for parsing OBO 1.2 formatted ontology files. Useful for reporting, comparing, and mapping data to other databases. There is presently no functionality for logical inference across the ontology.
|
3
|
+
A simple Ruby gem for parsing OBO 1.2 (?4) formatted ontology files. Useful for reporting, comparing, and mapping data to other databases. There is presently no functionality for logical inference across the ontology.
|
4
4
|
|
5
5
|
== Installation
|
6
6
|
|
@@ -8,6 +8,8 @@ A simple Ruby gem for parsing OBO 1.2 formatted ontology files. Useful for repo
|
|
8
8
|
|
9
9
|
== Use
|
10
10
|
|
11
|
+
=== General
|
12
|
+
|
11
13
|
require 'rubygems'
|
12
14
|
require 'obo_parser'
|
13
15
|
foo = parse_obo_file(File.read('my_ontology.obo')) # => An OboParser instance
|
@@ -36,13 +38,20 @@ A simple Ruby gem for parsing OBO 1.2 formatted ontology files. Useful for repo
|
|
36
38
|
|
37
39
|
foo.terms.first.relationships # => [['relation_ship', 'FOO:123'], ['other_relationship', 'FOO:456'] ...] An array of [relation, related term id], includes 'is_a', 'disjoint_from' and Typedefs
|
38
40
|
|
41
|
+
=== Convenience methods
|
42
|
+
|
43
|
+
foo.term_hash # => { term (String) => id (String), ... for each [Term] in the file. } !! Assumes term names are unique; they might not be, in which case you get key collisions.
|
44
|
+
foo.id_hash # => { id (String) => term (String), ... for each [Term] in the file. }
|
45
|
+
|
39
46
|
See also /test/test_obo_parser.rb
|
40
47
|
|
41
48
|
== Utilities
|
42
49
|
|
43
|
-
|
50
|
+
A small set of methods (e.g. comparing OBO ontologies) utilizing the gem are included in /lib/utilities.rb. For example, shared labels across sets of ontologies can be found and returned.
|
51
|
+
|
52
|
+
== Documentation
|
44
53
|
|
45
|
-
|
54
|
+
Code documentation is slowly being formalized using Yard.
|
46
55
|
|
47
56
|
== Copyright
|
48
57
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.4
|
1
|
+
0.3.5
|
data/lib/tokens.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module OboParser::Tokens
|
2
2
|
|
3
3
|
class Token
|
4
|
-
# this allows access the
|
4
|
+
# this allows access to the class attribute regexp, without using a class variable
|
5
5
|
class << self; attr_reader :regexp; end
|
6
6
|
attr_reader :value
|
7
7
|
def initialize(str)
|
@@ -17,7 +17,7 @@ module OboParser::Tokens
|
|
17
17
|
@regexp = Regexp.new(/\A\s*(\[typedef\])\s*/i)
|
18
18
|
end
|
19
19
|
|
20
|
-
# Token
|
20
|
+
# Token needs simplification, likely through creating additional tokens for quoted qualifiers, optional modifiers ({}), and the creation of individual
|
21
21
|
# tokens for individual tags that don't conform to the pattern used for def: tags.
|
22
22
|
# The code can't presently handle escaped characters (like \,), as bizarrely found in some OBO files.
|
23
23
|
class TagValuePair < Token
|
@@ -66,6 +66,7 @@ module OboParser::Tokens
|
|
66
66
|
qq = 0 # some failsafes
|
67
67
|
while xref_list.length > 0
|
68
68
|
qq += 1
|
69
|
+
debugger if qq == 499
|
69
70
|
raise "#{xref_list}" if qq > 500
|
70
71
|
xref_list.gsub!(/\A\s*,\s*/, '')
|
71
72
|
|
data/lib/utilities.rb
CHANGED
@@ -4,79 +4,61 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'obo_parser'))
|
|
4
4
|
|
5
5
|
module OboParser::Utilities
|
6
6
|
|
7
|
-
#
|
8
|
-
|
9
|
-
|
10
|
-
# of3 = File.read('hao3.obo')
|
11
|
-
# of4 = File.read('hao4.obo')
|
7
|
+
# Summarizes labels used by id in a two column tab delimited format
|
8
|
+
# Providing a cutoff will report only those ids/labels with > 1 label per id
|
9
|
+
# Does not (yet) include reference to synonyms, this could be easily extended.
|
12
10
|
#
|
13
|
-
|
14
|
-
|
11
|
+
#== Example use
|
12
|
+
# of1 = File.read('foo1.obo')
|
13
|
+
# of2 = File.read('foo2.obo')
|
14
|
+
# of3 = File.read('foo3.obo')
|
15
|
+
# of4 = File.read('foo4.obo')
|
16
|
+
#
|
17
|
+
# OboParser::Utilities.dump_comparison_by_id(0,[of1, of2, of3, of4])
|
18
|
+
#
|
19
|
+
# @param [Integer] cutoff only Term ids with > cutoff labels will be reported
|
20
|
+
# @param [Array] files an Array of read files
|
21
|
+
# @return [String] the translation in tab delimited format
|
22
|
+
def self.dump_comparison_by_id(cutoff = 0, files = [])
|
23
|
+
return '' if files.size < 1
|
24
|
+
|
15
25
|
of = []
|
16
26
|
files.each_with_index do |f, i|
|
17
27
|
of[i] = parse_obo_file(f)
|
18
28
|
end
|
19
|
-
|
29
|
+
|
20
30
|
all_data = {}
|
21
31
|
|
22
32
|
of.each do |f|
|
23
33
|
tmp_hash = f.id_hash
|
24
34
|
tmp_hash.keys.each do |id|
|
25
35
|
if all_data[id]
|
26
|
-
all_data[id].push
|
36
|
+
all_data[id].push(tmp_hash[id])
|
27
37
|
else
|
28
38
|
all_data[id] = [tmp_hash[id]]
|
29
39
|
end
|
30
40
|
end
|
31
41
|
end
|
32
42
|
|
33
|
-
puts "\nA list of all labels used across all submitted files for a given ID\n\n"
|
34
43
|
all_data.keys.sort.each do |k|
|
35
|
-
if all_data[k].uniq.size >
|
36
|
-
puts "#{k}\t
|
44
|
+
if all_data[k].uniq.size > cutoff
|
45
|
+
puts "#{k}\t#{all_data[k].uniq.join(', ')}"
|
37
46
|
end
|
38
47
|
end
|
39
48
|
end
|
40
49
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
|
44
|
-
agreement = ARGV[0]
|
45
|
-
raise "Provide a file with comparison." if agreement.nil?
|
46
|
-
comparison = File.read(agreement)
|
47
|
-
|
48
|
-
obo_files = Dir.entries('.').inject([]){|sum, a| sum.push( a =~ /\.obo\Z/ ? a : nil)}.compact!
|
49
|
-
identifiers = {}
|
50
|
-
|
51
|
-
obo_files.each do |f|
|
52
|
-
puts "Reading: #{f}"
|
53
|
-
identifiers.merge!( parse_obo_file(File.read(f)).id_hash )
|
54
|
-
end
|
55
|
-
|
56
|
-
comparison.each do |l|
|
57
|
-
v1, v2 = l.split("\t")
|
58
|
-
# puts "#{v1} - #{v2}"
|
59
|
-
|
60
|
-
next if v1.nil? || v2.nil?
|
61
|
-
|
62
|
-
v1.gsub!(/_/, ":")
|
63
|
-
v1.strip!
|
64
|
-
v2.gsub!(/_/, ":")
|
65
|
-
v2.strip!
|
66
|
-
|
67
|
-
puts (identifiers[v1].nil? ? 'NOT FOUND' : identifiers[v1]) +
|
68
|
-
"\t" +
|
69
|
-
(identifiers[v2].nil? ? 'NOT FOUND' : identifiers[v2])
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
# Returns labels found in all passed ontologies
|
74
|
-
# Usage:
|
50
|
+
# Returns all labels found in all passed ontologies. Does not yet include synonyms.
|
51
|
+
#
|
52
|
+
#== Example use
|
75
53
|
# of1 = File.read('fly_anatomy.obo')
|
76
54
|
# of2 = File.read('hao.obo')
|
77
55
|
# of3 = File.read('mosquito_anatomy.obo')
|
78
|
-
#
|
79
|
-
|
56
|
+
#
|
57
|
+
# OboParser::Utilities.shared_labels([of1, of3])
|
58
|
+
#
|
59
|
+
# @param [Array] files an Array of read files
|
60
|
+
# @return [String] labels, one per line
|
61
|
+
def self.shared_labels(files = [])
|
80
62
|
comparison = {}
|
81
63
|
|
82
64
|
files.each do |f|
|
@@ -91,17 +73,175 @@ module OboParser::Utilities
|
|
91
73
|
end
|
92
74
|
end
|
93
75
|
end
|
94
|
-
|
95
|
-
|
76
|
+
|
77
|
+
match = []
|
96
78
|
comparison.keys.each do |k|
|
97
79
|
if comparison[k] == files.size
|
98
80
|
match.push k
|
99
81
|
end
|
100
82
|
end
|
101
83
|
|
102
|
-
|
103
|
-
|
84
|
+
puts match.sort.join("\n")
|
85
|
+
puts "\n#{match.length} total."
|
104
86
|
|
105
87
|
end
|
106
|
-
|
88
|
+
|
89
|
+
|
90
|
+
#== Two column translation tools
|
91
|
+
|
92
|
+
HOMOLONTO_HEADER = %{
|
93
|
+
format-version: 1.2
|
94
|
+
auto-generated-by: obo_parser
|
95
|
+
default-namespace: fix_me
|
96
|
+
|
97
|
+
[Typedef]
|
98
|
+
id: OGEE:has_member
|
99
|
+
name: has_member
|
100
|
+
is_a: OBO_REL:relationship
|
101
|
+
def: "C has_member C', C is an homology group and C' is a biological object" []
|
102
|
+
comment: "We leave open the possibility that an homology group is a biological object. Thus, an homology group C may have C' has_member, with C' being an homology group."
|
103
|
+
is_transitive: true
|
104
|
+
is_anti_symmetric: true
|
105
|
+
|
106
|
+
}
|
107
|
+
|
108
|
+
|
109
|
+
# Takes a two column input file, references it to two ontologies, and provides a report.
|
110
|
+
#
|
111
|
+
#== Example use
|
112
|
+
# file = File.read('HAO_TGMA_list.txt')
|
113
|
+
# col1_obo = File.read('hao.obo')
|
114
|
+
# col2_obo = File.read('tgma.obo')
|
115
|
+
# column_translate(:data => file, :col1_obo => col1_obo, :col2_obo => col2_obo, :output => :homolonto)
|
116
|
+
#
|
117
|
+
# OboParser::Utilities.column_translate(:data => file, :col1_obo => col1_obo, :col2_obo => col2_obo, :output => :homolonto)
|
118
|
+
#== Output types
|
119
|
+
# There are several output report types
|
120
|
+
# :xls - Translates the columns in the data_file to the option passed in :translate_to, the first matching against col1_obo, the second against col2_obo. Returns an Excel file.
|
121
|
+
# :homolonto - Generates a homolonto compatible file to STDOUT
|
122
|
+
# :cols - Prints a two column format to STDOUT
|
123
|
+
#
|
124
|
+
# @param [Hash] options options.
|
125
|
+
# @param [Symbol] data the two column data file.
|
126
|
+
# @return [String] the translation in tab delimited format.
|
127
|
+
def self.column_translate(options = {})
|
128
|
+
opt = {
|
129
|
+
:data => nil,
|
130
|
+
:col1_obo => nil,
|
131
|
+
:col2_obo => nil,
|
132
|
+
:translate_to => :id, # also :label
|
133
|
+
:output => :cols, # also :xls, :homolonto
|
134
|
+
:output_filename => 'foo',
|
135
|
+
:index_start => 0
|
136
|
+
}.merge!(options)
|
137
|
+
|
138
|
+
c1obo = parse_obo_file(opt[:col1_obo])
|
139
|
+
c2obo = parse_obo_file(opt[:col2_obo])
|
140
|
+
|
141
|
+
case opt[:output]
|
142
|
+
when :xls
|
143
|
+
Spreadsheet.client_encoding = 'UTF-8'
|
144
|
+
book = Spreadsheet::Workbook.new
|
145
|
+
sheet = book.create_worksheet
|
146
|
+
when :homolonto
|
147
|
+
s = HOMOLONTO_HEADER
|
148
|
+
opt[:translate_to] = :id # force this in this mode
|
149
|
+
end
|
150
|
+
|
151
|
+
i = opt[:index_start]
|
152
|
+
v1 = nil # a label like 'head'
|
153
|
+
v2 = nil
|
154
|
+
c1 = nil # an id 'FOO:123'
|
155
|
+
c2 = nil
|
156
|
+
|
157
|
+
opt[:data].split(/\n/).each do |row|
|
158
|
+
i += 1
|
159
|
+
c1, c2 = row.split(/\t/).map(&:strip)
|
160
|
+
|
161
|
+
if c1.nil? || c2.nil?
|
162
|
+
puts
|
163
|
+
next
|
164
|
+
end
|
165
|
+
|
166
|
+
# the conversion
|
167
|
+
if opt[:translate_to] == :id
|
168
|
+
if c1 =~ /.*\:.*/ # it's an id, leave it
|
169
|
+
v1 = c1
|
170
|
+
else
|
171
|
+
v1 = c1obo.term_hash[c1]
|
172
|
+
end
|
173
|
+
if c2 =~ /.*\:.*/
|
174
|
+
v2 = c2
|
175
|
+
else
|
176
|
+
v2 = c2obo.term_hash[c2]
|
177
|
+
end
|
178
|
+
else
|
179
|
+
if c1 =~ /.*\:.*/
|
180
|
+
v1 = c1obo.id_hash[c1]
|
181
|
+
else
|
182
|
+
v1 = c1
|
183
|
+
end
|
184
|
+
if c2 =~ /.*\:.*/
|
185
|
+
v2 = c2obo.id_hash[c2]
|
186
|
+
else
|
187
|
+
v2 = c2
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
case opt[:output]
|
192
|
+
when :cols
|
193
|
+
puts "#{v1}\t#{v2}"
|
194
|
+
when :xls
|
195
|
+
sheet[i,0] = v1
|
196
|
+
sheet[i,1] = OboParser::Utilities.term_stanza_from_file(v1, opt[:col1_obo])
|
197
|
+
sheet[i,2] = v2
|
198
|
+
sheet[i,3] = OboParser::Utilities.term_stanza_from_file(v2, opt[:col2_obo])
|
199
|
+
when :homolonto
|
200
|
+
s << OboParser::Utilities.homolonto_stanza(i, c1obo.id_hash[v1] , v1, v2) # "#{c1obo.id_hash[v1]} ! #{c2obo.id_hash[v2]}"
|
201
|
+
s << "\n\n"
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
case opt[:output]
|
206
|
+
when :xls
|
207
|
+
book.write "#{opt[:output_filename]}.xls"
|
208
|
+
when :homolonto
|
209
|
+
puts s + "\n"
|
210
|
+
end
|
211
|
+
|
212
|
+
true
|
213
|
+
end
|
214
|
+
|
215
|
+
# Returns a HomolOnto Stanza
|
216
|
+
#
|
217
|
+
# @param [String] id an externally tracked id for the id: tag like '00001'
|
218
|
+
# @param [String] name a name for the name: tag
|
219
|
+
# @param [Array] members an Array of 2 or more members for the relationship: has_member tag like ['FOO:123', 'BAR:456']
|
220
|
+
# @return [String] the stanza requested
|
221
|
+
def self.homolonto_stanza(id, name, *members)
|
222
|
+
return 'NOT ENOUGH RELATIONSHIPS' if members.length < 2
|
223
|
+
s = []
|
224
|
+
s << '[Term]'
|
225
|
+
s << "id: HOG:#{id}"
|
226
|
+
s << "name: #{name}"
|
227
|
+
members.each do |m|
|
228
|
+
s << "relationship: has_member #{m}"
|
229
|
+
end
|
230
|
+
s.join("\n")
|
231
|
+
end
|
232
|
+
|
233
|
+
#== Helper methods that don't require the obo_parser library
|
234
|
+
|
235
|
+
# Given a Term id and a String representing an OBO file returns that stanza.
|
236
|
+
#
|
237
|
+
# @param [String] id a Term id like 'FOO:123'
|
238
|
+
# @param [String] file an OBO file as a String like File.read('my.obo')
|
239
|
+
# @return [String] the stanza requested
|
240
|
+
def self.term_stanza_from_file(id, file)
|
241
|
+
foo = ""
|
242
|
+
file =~ /(^\[Term\]\s*?id:\s*?#{id}.*?)(^\[Term\]|^\[Typedef\])/im
|
243
|
+
foo = $1 if !$1.nil?
|
244
|
+
foo.gsub(/\n\r/,"\n")
|
245
|
+
end
|
246
|
+
|
107
247
|
end
|
data/obo_parser.gemspec
CHANGED
@@ -1,51 +1,49 @@
|
|
1
1
|
# Generated by jeweler
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{obo_parser}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["mjy"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-06-09}
|
13
13
|
s.description = %q{Provides all-in-one object containing the contents of an OBO formatted file. OBO version 1.2 is targeted, though this should work for 1.0. }
|
14
14
|
s.email = %q{diapriid@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
|
17
|
+
"README.rdoc"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
21
|
+
"LICENSE",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"init.rb",
|
26
|
+
"install.rb",
|
27
|
+
"lib/lexer.rb",
|
28
|
+
"lib/obo_parser.rb",
|
29
|
+
"lib/parser.rb",
|
30
|
+
"lib/tokens.rb",
|
31
|
+
"lib/utilities.rb",
|
32
|
+
"obo_parser.gemspec",
|
33
|
+
"tasks/obo_parser_tasks.rake",
|
34
|
+
"test/cell.obo",
|
35
|
+
"test/go.obo",
|
36
|
+
"test/hao.obo",
|
37
|
+
"test/obo_1.0_test.txt",
|
38
|
+
"test/obo_1.0_test_wo_typedefs.txt",
|
39
|
+
"test/test_obo_parser.rb",
|
40
|
+
"test/tgma.obo",
|
41
|
+
"uninstall.rb"
|
40
42
|
]
|
41
43
|
s.homepage = %q{http://github.com/mjy/obo_parser}
|
42
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
43
44
|
s.require_paths = ["lib"]
|
44
|
-
s.rubygems_version = %q{1.
|
45
|
+
s.rubygems_version = %q{1.7.2}
|
45
46
|
s.summary = %q{A simple OBO file handler.}
|
46
|
-
s.test_files = [
|
47
|
-
"test/test_obo_parser.rb"
|
48
|
-
]
|
49
47
|
|
50
48
|
if s.respond_to? :specification_version then
|
51
49
|
s.specification_version = 3
|
data/test/cell.obo
CHANGED
@@ -4365,7 +4365,7 @@ is_a: CL:0000255 ! eukaryotic cell
|
|
4365
4365
|
[Term]
|
4366
4366
|
id: CL:0000611
|
4367
4367
|
name: eosinophil progenitor cell
|
4368
|
-
comment: These cells are CD34-positive, CD45RA-negative, CD71-negative, and lineage-negative (CD2, CD3 epsilon, CD4, CD5, CD8a, CD14, CD19, CD20, integrin alpha-M, NCAM-1, SCA
|
4368
|
+
comment: These cells are CD34-positive, CD45RA-negative, CD71-negative, and lineage-negative (CD2, CD3 epsilon, CD4, CD5, CD8a, CD14, CD19, CD20, integrin alpha-M, NCAM-1, SCA-1, Ly6G, Ly76).
|
4369
4369
|
synonym: "CFU-Eo" RELATED []
|
4370
4370
|
synonym: "colony forming unit eosinophil" RELATED []
|
4371
4371
|
synonym: "EoP" EXACT [PMID:15955840, PMID:19114669]
|