semtools 0.1.6 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
4
- data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
3
+ metadata.gz: a3f63cc6548a9938e31121d2018d1c1c477987007c5d253b5fa814a285bdb576
4
+ data.tar.gz: e1911d3157c3046590ca13bc86215d2260b4a8b2b1b25affa5c2673881036795
5
5
  SHA512:
6
- metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
7
- data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
6
+ metadata.gz: 30c95df80957a4a35b6fea05b9552352f529d8e45c10f6b128924a3ce2ee5d90e92a1e9d5fe0016d25538147e12d3a9199c81222642c94cdd0eb3c89eea168ef
7
+ data.tar.gz: ddc9e600fd984e68d060b7be05adf27b3f20bb67e638d42acc4b9b156eedabfce20d6f588a03d1fbc2948fedbd80d498f1767c0e3f8ea03720fa0ca327b95f3c
data/Gemfile CHANGED
@@ -5,5 +5,8 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
5
5
  # Specify your gem's dependencies in semtools.gemspec
6
6
  gemspec
7
7
 
8
- gem "rake", "~> 12.0"
8
+ gem "rake", "~> 13.0"
9
9
  gem "minitest", "~> 5.0"
10
+
11
+ expcalc_dev_path = File.expand_path('~/dev_gems/expcalc')
12
+ gem "expcalc", github: "seoanezonjic/expcalc", branch: "master" if Dir.exist?(expcalc_dev_path)
data/bin/semtools.rb ADDED
@@ -0,0 +1,446 @@
1
+ #! /usr/bin/env ruby
2
+ ROOT_PATH = File.dirname(__FILE__)
3
+ $LOAD_PATH.unshift(File.expand_path(File.join(ROOT_PATH, '..', 'lib')))
4
+ EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
5
+
6
+ require 'optparse'
7
+ require 'down'
8
+ require 'semtools'
9
+
10
+ ######################################################################################
11
+ ## METHODS
12
+ ######################################################################################
13
+ def load_tabular_file(file)
14
+ records = []
15
+ File.open(file).each do |line|
16
+ line.chomp!
17
+ fields = line.split("\t")
18
+ records << fields
19
+ end
20
+ return records
21
+ end
22
+
23
+ def store_profiles(file, ontology)
24
+ file.each do |id, terms|
25
+ ontology.add_profile(id, terms)
26
+ end
27
+ end
28
+
29
+ def load_value(hash_to_load, key, value, unique = true)
30
+ query = hash_to_load[key]
31
+ if query.nil?
32
+ value = [value] if value.class != Array
33
+ hash_to_load[key] = value
34
+ else
35
+ if value.class == Array
36
+ query.concat(value)
37
+ else
38
+ query << value
39
+ end
40
+ query.uniq! unless unique == nil
41
+ end
42
+ end
43
+
44
+ def translate(ontology, type, options, profiles = nil)
45
+ not_translated = {}
46
+ if type == 'names'
47
+ ontology.profiles.each do |id, terms|
48
+ translation, untranslated = ontology.translate_ids(terms)
49
+ ontology.profiles[id] = translation
50
+ not_translated[id] = untranslated unless untranslated.empty?
51
+ end
52
+ elsif type == 'codes'
53
+ profiles.each do |id,terms|
54
+ translation, untranslated = ontology.translate_names(terms)
55
+ profiles[id] = translation
56
+ profiles[id] = profiles[id].join("#{options[:separator]}")
57
+ not_translated[id] = untranslated unless untranslated.empty?
58
+ end
59
+ end
60
+ if !not_translated.empty?
61
+ File.open(options[:untranslated_path], 'w') do |file|
62
+ not_translated.each do |id, terms|
63
+ file.puts([id, terms.join(";")].join("\t"))
64
+ end
65
+ end
66
+ end
67
+ end
68
+
69
+ def clean_profile(profile, ontology, options)
70
+ cleaned_profile = ontology.clean_profile_hard(profile)
71
+ unless options[:term_filter].nil?
72
+ cleaned_profile.select! {|term| ontology.get_ancestors(term).include?(options[:term_filter])}
73
+ end
74
+ return cleaned_profile
75
+ end
76
+
77
+ def clean_profiles(profiles, ontology, options)
78
+ removed_profiles = []
79
+ profiles.each do |id, terms|
80
+ cleaned_profile = clean_profile(terms, ontology, options)
81
+ profiles[id] = cleaned_profile
82
+ removed_profiles << id if cleaned_profile.empty?
83
+ end
84
+ removed_profiles.each{|rp| profiles.delete(rp)}
85
+ return removed_profiles
86
+ end
87
+
88
+ def expand_profiles(profiles, ontology, unwanted_terms = [])
89
+ profiles.each do |disease_id, terms|
90
+ terms.each do |term|
91
+ profiles[disease_id] << ontology.get_ancestors(term).difference(unwanted_terms)
92
+ end
93
+ end
94
+ end
95
+
96
+ def write_similarity_profile_list(input, onto_obj, similarity_type)
97
+ similarity_file = File.basename(input, ".*")+'_semantic_similarity_list'
98
+ File.open(similarity_file, 'w') do |file|
99
+ onto_obj.profiles.each do |profile_query_key, profile_query_value|
100
+ onto_obj.profiles.each do |profile_search_key, profile_search_value|
101
+ file.puts([profile_query_key, profile_search_key, onto_obj.compare(profile_query_value, profile_search_value, sim_type: similarity_type)].join("\t"))
102
+ end
103
+ end
104
+ end
105
+ end
106
+
107
+ def download(source, key, output)
108
+ source_list = load_tabular_file(source).to_h
109
+ external_data = File.dirname(source)
110
+ if key == 'list'
111
+ Dir.glob(File.join(external_data,'*.obo')){|f| puts f}
112
+ else
113
+ url = source_list[key]
114
+ if !output.nil?
115
+ output_path = output
116
+ else
117
+ file_name = key + '.obo'
118
+ if File.writable?(external_data)
119
+ output_path = File.join(external_data, file_name)
120
+ else
121
+ output_path = file_name
122
+ end
123
+ end
124
+ if !url.nil?
125
+ Down::NetHttp.download(url, destination: output_path, max_redirects: 5)
126
+ File.chmod(0644, output_path) # Correct file permissions set by down gem
127
+ end
128
+ end
129
+ end
130
+
131
+ def get_ontology_file(path, source)
132
+ if !File.exists?(path)
133
+ ont_index = load_tabular_file(source).to_h
134
+ if !ont_index[path].nil?
135
+ path = File.join(File.dirname(source), path + '.obo')
136
+ else
137
+ abort("Input ontology file not exists")
138
+ end
139
+ end
140
+ return path
141
+ end
142
+
143
+ def get_stats(stats)
144
+ report_stats = []
145
+ report_stats << ['Elements', stats[:count]]
146
+ report_stats << ['Elements Non Zero', stats[:countNonZero]]
147
+ report_stats << ['Non Zero Density', stats[:countNonZero].fdiv(stats[:count])]
148
+ report_stats << ['Max', stats[:max]]
149
+ report_stats << ['Min', stats[:min]]
150
+ report_stats << ['Average', stats[:average]]
151
+ report_stats << ['Variance', stats[:variance]]
152
+ report_stats << ['Standard Deviation', stats[:standardDeviation]]
153
+ report_stats << ['Q1', stats[:q1]]
154
+ report_stats << ['Median', stats[:median]]
155
+ report_stats << ['Q3', stats[:q3]]
156
+ return report_stats
157
+ end
158
+
159
+
160
+
161
+
162
+
163
+
164
+ ####################################################################################
165
+ ## OPTPARSE
166
+ ####################################################################################
167
+ options = {}
168
+ OptionParser.new do |opts|
169
+ opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
170
+
171
+ options[:download] = nil
172
+ opts.on("-d", "--download STRING", "Download obo file from official resource. MONDO, GO and HPO are possible values.") do |item|
173
+ options[:download] = item
174
+ end
175
+
176
+ options[:input_file] = nil
177
+ opts.on("-i", "--input_file PATH", "Filepath of profile data") do |item|
178
+ options[:input_file] = item
179
+ end
180
+
181
+ options[:output_file] = nil
182
+ opts.on("-o", "--output_file PATH", "Output filepath") do |item|
183
+ options[:output_file] = item
184
+ end
185
+
186
+ options[:IC] = false
187
+ opts.on("-I", "--IC", "Get IC") do
188
+ options[:IC] = true
189
+ end
190
+
191
+ options[:ontology_file] = nil
192
+ opts.on("-O PATH", "--ontology_file PATH", "Path to ontology file") do |item|
193
+ options[:ontology_file] = item
194
+ end
195
+
196
+ options[:term_filter] = nil
197
+ opts.on("-T STRING", "--term_filter STRING", "If specified, only terms that are descendants of the specified term will be kept on a profile when cleaned") do |item|
198
+ options[:term_filter] = item.to_sym
199
+ end
200
+
201
+ options[:translate] = nil
202
+ opts.on("-t STRING", "--translate STRING", "Translate to 'names' or to 'codes'") do |item|
203
+ options[:translate] = item
204
+ end
205
+
206
+ opts.on("-s method", "--similarity method", "Calculate similarity between profile IDs computed by 'resnik', 'lin' or 'jiang_conrath' methods. ") do |sim_method|
207
+ options[:similarity] = sim_method.to_sym
208
+ end
209
+
210
+ options[:clean_profiles] = false
211
+ opts.on("-c", "--clean_profiles", "Removes ancestors, descendants and obsolete terms from profiles") do
212
+ options[:clean_profiles] = true
213
+ end
214
+
215
+ options[:removed_path] = 'rejected_profs'
216
+ opts.on("-r PATH", "--removed_path PATH", "Desired path to write removed profiles file") do |item|
217
+ options[:removed_path] = item
218
+ end
219
+
220
+ options[:untranslated_path] = nil
221
+ opts.on("-u PATH", "--untranslated_path PATH", "Desired path to write untranslated terms file") do |item|
222
+ options[:untranslated_path] = item
223
+ end
224
+
225
+ options[:keyword] = nil
226
+ opts.on("-k STRING", "--keyword STRING", "regex used to get xref terms in the ontology file") do |item|
227
+ options[:keyword] = item
228
+ end
229
+
230
+ options[:xref_sense] = :byValue
231
+ opts.on("--xref_sense ", "Ontology-xref or xref-ontology. By default xref-ontology if set, ontology-xref") do
232
+ options[:xref_sense] = :byTerm
233
+ end
234
+
235
+ options[:expand_profiles] = false
236
+ opts.on("-e", "--expand_profiles", "Expand profiles adding ancestors") do
237
+ options[:expand_profiles] = true
238
+ end
239
+
240
+ options[:unwanted_terms] = []
241
+ opts.on("-U", "--unwanted_terms STRING", "Comma separated terms not wanted to be included in profile expansion") do |item|
242
+ options[:unwanted_terms] = item
243
+ end
244
+
245
+ options[:separator] = ";"
246
+ opts.on("-S STRING", "--separator STRING", "Separator used for the terms profile") do |sep|
247
+ options[:separator] = sep
248
+ end
249
+
250
+ options[:childs] = [[], '']
251
+ opts.on("-C STRING", "--childs STRING", "Term code list (comma separated) to generate child list") do |item|
252
+ if item.include?('/')
253
+ modifiers, terms = item.split('/')
254
+ else
255
+ modifiers = ''
256
+ terms = item
257
+ end
258
+ terms = terms.split(',').map{|t| t.to_sym}
259
+ options[:childs] = [terms, modifiers]
260
+ end
261
+
262
+ options[:statistics] = false
263
+ opts.on("-n", "--statistics", "To obtain main statistical descriptors of the profiles file") do
264
+ options[:statistics] = true
265
+ end
266
+
267
+ options[:list_translate] = nil
268
+ opts.on("-l STRING", "--list_translate STRING", "Translate to 'names' or to 'codes' input list") do |sep|
269
+ options[:list_translate] = sep
270
+ end
271
+
272
+ options[:subject_column] = 0
273
+ opts.on("-f NUM", "--subject_column NUM", "The number of the column for the subject id") do |ncol|
274
+ options[:subject_column] = ncol.to_i
275
+ end
276
+
277
+ options[:annotations_column] = 1
278
+ opts.on("-a NUM", "--annotations_column NUM", "The number of the column for the annotation ids") do |ncol|
279
+ options[:annotations_column] = ncol.to_i
280
+ end
281
+
282
+
283
+ options[:list_term_attributes] = false
284
+ opts.on("--list_term_attributes", "The number of the column for the annotation ids") do
285
+ options[:list_term_attributes] = true
286
+ end
287
+
288
+ end.parse!
289
+
290
+ ####################################################################################
291
+ ## MAIN
292
+ ####################################################################################
293
+ ont_index_file = File.join(EXTERNAL_DATA, 'ontologies.txt')
294
+ if !options[:download].nil?
295
+ download(ont_index_file, options[:download], options[:output_file])
296
+ Process.exit
297
+ end
298
+
299
+ if !options[:ontology_file].nil?
300
+ options[:ontology_file] = get_ontology_file(options[:ontology_file], ont_index_file)
301
+ end
302
+ ontology = Ontology.new(file: options[:ontology_file], load_file: true)
303
+
304
+ if !options[:input_file].nil?
305
+ data = load_tabular_file(options[:input_file])
306
+ if options[:list_translate].nil? || !options[:keyword].nil?
307
+ data.map!{|row|
308
+ [row[options[:subject_column]],
309
+ row[options[:annotations_column]].split(options[:separator]).map!{|term| term.to_sym}]
310
+ }
311
+ store_profiles(data, ontology) if options[:translate] != 'codes' && options[:keyword].nil?
312
+ end
313
+ end
314
+
315
+ if !options[:list_translate].nil?
316
+ data.each do |term|
317
+ if options[:list_translate] == 'names'
318
+ translation, untranslated = ontology.translate_ids(term)
319
+ elsif options[:list_translate] == 'codes'
320
+ translation, untranslated = ontology.translate_names(term)
321
+ end
322
+ puts "#{term.first}\t#{translation.empty? ? '-' : translation.first}"
323
+ end
324
+ Process.exit
325
+ end
326
+
327
+ if options[:translate] == 'codes'
328
+ profiles = {}
329
+ data.each do |id, terms|
330
+ load_value(profiles, id, terms)
331
+ profiles[id] = terms.split(options[:separator])
332
+ end
333
+ translate(ontology, 'codes', options, profiles)
334
+ store_profiles(profiles, ontology)
335
+ end
336
+
337
+ if options[:clean_profiles]
338
+ removed_profiles = clean_profiles(ontology.profiles, ontology, options)
339
+ if !removed_profiles.nil? && !removed_profiles.empty?
340
+ File.open(options[:removed_path], 'w') do |f|
341
+ removed_profiles.each do |profile|
342
+ f.puts profile
343
+ end
344
+ end
345
+ end
346
+ end
347
+
348
+ if options[:expand_profiles]
349
+ expanded_profiles = expand_profiles(ontology.profiles, ontology, options[:unwanted_terms])
350
+ end
351
+
352
+ if !options[:similarity].nil?
353
+ write_similarity_profile_list(input = options[:input_file], onto_obj=ontology, similarity_type = options[:similarity])
354
+ end
355
+
356
+
357
+ if options[:IC]
358
+ ontology.add_observed_terms_from_profiles
359
+ by_ontology, by_freq = ontology.get_profiles_resnik_dual_ICs
360
+ ic_file = File.basename(options[:input_file], ".*")+'_IC_onto_freq'
361
+ File.open(ic_file , 'w') do |file|
362
+ ontology.profiles.keys.each do |id|
363
+ file.puts([id, by_ontology[id], by_freq[id]].join("\t"))
364
+ end
365
+ end
366
+ end
367
+
368
+ if options[:translate] == 'names'
369
+ translate(ontology, 'names', options)
370
+ end
371
+
372
+ if !options[:childs].first.empty?
373
+ terms, modifiers = options[:childs]
374
+ all_childs = []
375
+ terms.each do |term|
376
+ childs = ontology.get_descendants(term)
377
+ all_childs = all_childs | childs
378
+ end
379
+ if modifiers.include?('r')
380
+ relations = []
381
+ all_childs = all_childs | terms # Add parents that generated child list
382
+ all_childs.each do |term|
383
+ descendants = ontology.get_direct_descendants(term)
384
+ if !descendants.nil?
385
+ descendants.each do |desc|
386
+ relations << [term, desc]
387
+ end
388
+ end
389
+ end
390
+ relations.each do |rel|
391
+ rel, _ = ontology.translate_ids(rel) if modifiers.include?('n')
392
+ puts rel.join("\t")
393
+ end
394
+ else
395
+ all_childs.each do |c|
396
+ if modifiers.include?('n')
397
+ puts ontology.translate_id(c)
398
+ else
399
+ puts c
400
+ end
401
+ end
402
+ end
403
+ end
404
+
405
+ if !options[:output_file].nil?
406
+ File.open(options[:output_file], 'w') do |file|
407
+ ontology.profiles.each do |id, terms|
408
+ file.puts([id, terms.join("|")].join("\t"))
409
+ end
410
+ end
411
+ end
412
+
413
+ if options[:statistics]
414
+ get_stats(ontology.profile_stats).each do |stat|
415
+ puts stat.join("\t")
416
+ end
417
+ end
418
+
419
+ if options[:list_term_attributes]
420
+ term_attributes = ontology.list_term_attributes
421
+ term_attributes.each do |t_attr|
422
+ t_attr[0] = t_attr[0].to_s
423
+ puts t_attr.join("\t")
424
+ end
425
+ end
426
+
427
+ if !options[:keyword].nil?
428
+ xref_translated = []
429
+ ontology.calc_dictionary(:xref, select_regex: /(#{options[:keyword]})/, store_tag: :tag, multiterm: true, substitute_alternatives: false)
430
+ dict = ontology.dicts[:tag][options[:xref_sense]]
431
+ data.each do |id, prof|
432
+ xrefs = []
433
+ prof.each do |t|
434
+ query = dict[t.to_s]
435
+ xrefs.concat(query) if !query.nil?
436
+ end
437
+ xref_translated << [id, xrefs] if !xrefs.empty?
438
+ end
439
+ File.open(options[:output_file], 'w') do |f|
440
+ xref_translated.each do |id, prof|
441
+ prof.each do |t|
442
+ f.puts [id, t].join("\t")
443
+ end
444
+ end
445
+ end
446
+ end
data/bin/strsimnet.rb CHANGED
@@ -111,12 +111,11 @@ texts2compare = load_table_file(input_file = options[:input_file],
111
111
  targetCol = options[:cindex],
112
112
  filterCol = options[:findex],
113
113
  filterValue = options[:filter_value])
114
-
115
114
  # Verbose point
116
115
  puts "Calculating similitude for (" + texts2compare.length.to_s + ") elements"
117
116
 
118
117
  # Obtain all Vs all
119
- similitudes_AllVsAll = similitude_network(texts2compare,options[:rm_char])
118
+ similitudes_AllVsAll = similitude_network(texts2compare, charsToRemove: options[:rm_char])
120
119
 
121
120
  # Verbose point
122
121
  puts "Writing output file ..."
@@ -0,0 +1,4 @@
1
+ GO http://purl.obolibrary.org/obo/go/go-basic.obo
2
+ HPO http://purl.obolibrary.org/obo/hp.obo
3
+ MONDO http://purl.obolibrary.org/obo/mondo.obo
4
+ EFO http://www.ebi.ac.uk/efo/efo.obo
@@ -1,148 +1,148 @@
1
- # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
- #to cmpute fisher exact test
3
- #Fisher => http://www.biostathandbook.com/fishers.html
4
- def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
- #puts '-', listA.inspect, listB.inspect, '-'
6
- listA_listB = listA & listB
7
- listA_nolistB = listA - listB
8
- nolistA_listB = listB - listA
9
- if weigths.nil?
10
- listA_listB_count = listA_listB.length
11
- listA_nolistB_count = listA_nolistB.length
12
- nolistA_listB_count = nolistA_listB.length
13
- nolistA_nolistB_count = all_elements_count - (listA | listB).length
14
- else
15
- # Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
16
- # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
17
- listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
- listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
- nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
1
+ # # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
+ # #to cmpute fisher exact test
3
+ # #Fisher => http://www.biostathandbook.com/fishers.html
4
+ # def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
+ # #puts '-', listA.inspect, listB.inspect, '-'
6
+ # listA_listB = listA & listB
7
+ # listA_nolistB = listA - listB
8
+ # nolistA_listB = listB - listA
9
+ # if weigths.nil?
10
+ # listA_listB_count = listA_listB.length
11
+ # listA_nolistB_count = listA_nolistB.length
12
+ # nolistA_listB_count = nolistA_listB.length
13
+ # nolistA_nolistB_count = all_elements_count - (listA | listB).length
14
+ # else
15
+ # # Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
16
+ # # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
17
+ # listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
+ # listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
+ # nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
20
20
 
21
- if partial_weigths
22
- nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
- all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
- else
25
- nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
- all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
- end
28
- end
29
- #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
30
- if tail == 'two_sided'
31
- accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
32
- elsif tail == 'less'
33
- accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
34
- end
35
- return accumulated_prob
36
- end
21
+ # if partial_weigths
22
+ # nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
+ # all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
+ # else
25
+ # nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
+ # all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
+ # end
28
+ # end
29
+ # #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
30
+ # if tail == 'two_sided'
31
+ # accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
32
+ # elsif tail == 'less'
33
+ # accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
34
+ # end
35
+ # return accumulated_prob
36
+ # end
37
37
 
38
- def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
39
- #https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
40
- accumulated_prob = 0
41
- ref_prob = compute_hyper_prob(
42
- listA_listB_count,
43
- listA_nolistB_count,
44
- nolistA_listB_count,
45
- nolistA_nolistB_count,
46
- all_elements_count
47
- )
48
- accumulated_prob += ref_prob
49
- [listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
50
- n += 1
51
- prob = compute_hyper_prob(
52
- listA_listB_count - n,
53
- listA_nolistB_count + n,
54
- nolistA_listB_count + n,
55
- nolistA_nolistB_count - n,
56
- all_elements_count
57
- )
58
- prob <= ref_prob ? accumulated_prob += prob : break
59
- end
38
+ # def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
39
+ # #https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
40
+ # accumulated_prob = 0
41
+ # ref_prob = compute_hyper_prob(
42
+ # listA_listB_count,
43
+ # listA_nolistB_count,
44
+ # nolistA_listB_count,
45
+ # nolistA_nolistB_count,
46
+ # all_elements_count
47
+ # )
48
+ # accumulated_prob += ref_prob
49
+ # [listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
50
+ # n += 1
51
+ # prob = compute_hyper_prob(
52
+ # listA_listB_count - n,
53
+ # listA_nolistB_count + n,
54
+ # nolistA_listB_count + n,
55
+ # nolistA_nolistB_count - n,
56
+ # all_elements_count
57
+ # )
58
+ # prob <= ref_prob ? accumulated_prob += prob : break
59
+ # end
60
60
 
61
- [listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
62
- n += 1
63
- prob = compute_hyper_prob(
64
- listA_listB_count + n,
65
- listA_nolistB_count - n,
66
- nolistA_listB_count - n,
67
- nolistA_nolistB_count + n,
68
- all_elements_count
69
- )
70
- accumulated_prob += prob if prob <= ref_prob
71
- end
61
+ # [listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
62
+ # n += 1
63
+ # prob = compute_hyper_prob(
64
+ # listA_listB_count + n,
65
+ # listA_nolistB_count - n,
66
+ # nolistA_listB_count - n,
67
+ # nolistA_nolistB_count + n,
68
+ # all_elements_count
69
+ # )
70
+ # accumulated_prob += prob if prob <= ref_prob
71
+ # end
72
72
 
73
- return accumulated_prob
74
- end
73
+ # return accumulated_prob
74
+ # end
75
75
 
76
- def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
77
- accumulated_prob = 0
78
- [listA_listB_count, nolistA_nolistB_count].min.times do |n|
79
- accumulated_prob += compute_hyper_prob(
80
- listA_listB_count - n,
81
- listA_nolistB_count + n,
82
- nolistA_listB_count + n,
83
- nolistA_nolistB_count - n,
84
- all_elements_count
85
- )
86
- end
87
- return accumulated_prob
88
- end
76
+ # def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
77
+ # accumulated_prob = 0
78
+ # [listA_listB_count, nolistA_nolistB_count].min.times do |n|
79
+ # accumulated_prob += compute_hyper_prob(
80
+ # listA_listB_count - n,
81
+ # listA_nolistB_count + n,
82
+ # nolistA_listB_count + n,
83
+ # nolistA_nolistB_count - n,
84
+ # all_elements_count
85
+ # )
86
+ # end
87
+ # return accumulated_prob
88
+ # end
89
89
 
90
- def compute_hyper_prob(a, b, c, d, n)
91
- # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
92
- binomA = binom(a + b, a)
93
- binomC = binom(c + d, c)
94
- divisor = binom(n, a + c)
95
- return (binomA * binomC).fdiv(divisor)
96
- end
90
+ # def compute_hyper_prob(a, b, c, d, n)
91
+ # # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
92
+ # binomA = binom(a + b, a)
93
+ # binomC = binom(c + d, c)
94
+ # divisor = binom(n, a + c)
95
+ # return (binomA * binomC).fdiv(divisor)
96
+ # end
97
97
 
98
- def binom(n,k)
99
- if k > 0 && k < n
100
- res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
101
- else
102
- res = 1
103
- end
104
- end
98
+ # def binom(n,k)
99
+ # if k > 0 && k < n
100
+ # res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
101
+ # else
102
+ # res = 1
103
+ # end
104
+ # end
105
105
 
106
- #to cmpute adjusted pvalues
107
- #https://rosettacode.org/wiki/P-value_correction#Ruby
108
- def get_benjaminiHochberg_pvalues(arr_pvalues)
109
- n = arr_pvalues.length
110
- arr_o = order(arr_pvalues, true)
111
- arr_cummin_input = []
112
- (0..(n - 1)).each do |i|
113
- arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
114
- end
115
- arr_ro = order(arr_o)
116
- arr_cummin = cummin(arr_cummin_input)
117
- arr_pmin = pmin(arr_cummin)
118
- return arr_pmin.values_at(*arr_ro)
119
- end
106
+ # #to cmpute adjusted pvalues
107
+ # #https://rosettacode.org/wiki/P-value_correction#Ruby
108
+ # def get_benjaminiHochberg_pvalues(arr_pvalues)
109
+ # n = arr_pvalues.length
110
+ # arr_o = order(arr_pvalues, true)
111
+ # arr_cummin_input = []
112
+ # (0..(n - 1)).each do |i|
113
+ # arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
114
+ # end
115
+ # arr_ro = order(arr_o)
116
+ # arr_cummin = cummin(arr_cummin_input)
117
+ # arr_pmin = pmin(arr_cummin)
118
+ # return arr_pmin.values_at(*arr_ro)
119
+ # end
120
120
 
121
- def order(array, decreasing = false)
122
- if decreasing == false
123
- array.sort.map { |n| array.index(n) }
124
- else
125
- array.sort.map { |n| array.index(n) }.reverse
126
- end
127
- end
121
+ # def order(array, decreasing = false)
122
+ # if decreasing == false
123
+ # array.sort.map { |n| array.index(n) }
124
+ # else
125
+ # array.sort.map { |n| array.index(n) }.reverse
126
+ # end
127
+ # end
128
128
 
129
- def cummin(array)
130
- cumulative_min = array.first
131
- arr_cummin = []
132
- array.each do |p|
133
- cumulative_min = [p, cumulative_min].min
134
- arr_cummin << cumulative_min
135
- end
136
- return arr_cummin
137
- end
129
+ # def cummin(array)
130
+ # cumulative_min = array.first
131
+ # arr_cummin = []
132
+ # array.each do |p|
133
+ # cumulative_min = [p, cumulative_min].min
134
+ # arr_cummin << cumulative_min
135
+ # end
136
+ # return arr_cummin
137
+ # end
138
138
 
139
- def pmin(array)
140
- x = 1
141
- pmin_array = []
142
- array.each_index do |i|
143
- pmin_array[i] = [array[i], x].min
144
- abort if pmin_array[i] > 1
145
- end
146
- return pmin_array
147
- end
139
+ # def pmin(array)
140
+ # x = 1
141
+ # pmin_array = []
142
+ # array.each_index do |i|
143
+ # pmin_array[i] = [array[i], x].min
144
+ # abort if pmin_array[i] > 1
145
+ # end
146
+ # return pmin_array
147
+ # end
148
148
 
@@ -1,3 +1,4 @@
1
+ require 'expcalc'
1
2
  require 'json'
2
3
  require 'colorize'
3
4
 
@@ -45,7 +46,7 @@ class Ontology
45
46
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
46
47
  @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
47
48
  @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
48
-
49
+
49
50
  #############################################
50
51
  # CONSTRUCTOR
51
52
  #############################################
@@ -202,6 +203,7 @@ class Ontology
202
203
  # Only TERMS multivalue tags (future add Typedefs and Instance)
203
204
  # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
204
205
  attributes.each do |tag, value|
206
+ value.gsub!(/{source=[\\\":A-Za-z0-9\/\.\-, =]+} /, '') if tag == 'is_a' # To delete "source" attributes in is_a tag of MONDO ontology
205
207
  # Check
206
208
  raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
207
209
  # Prepare
@@ -553,14 +555,14 @@ class Ontology
553
555
  self.get_index_obsoletes
554
556
  self.get_index_alternatives
555
557
  self.get_index_child_parent_relations
556
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
558
+ @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
557
559
  ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
558
560
  @alternatives_index.compact!
559
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
561
+ @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
560
562
  @obsoletes_index.compact!
561
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
563
+ @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
562
564
  @ancestors_index.compact!
563
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
565
+ @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
564
566
  @descendants_index.compact!
565
567
  self.get_index_frequencies
566
568
  self.calc_dictionary(:name)
@@ -721,7 +723,7 @@ class Ontology
721
723
  # an array with all ancestors/descendants of given term or nil if parents are not available yet
722
724
  def get_familiar(term, return_ancestors = true, filter_alternatives = false)
723
725
  # Find into parentals
724
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
726
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
725
727
  if !familiars.nil?
726
728
  familiars = familiars.clone
727
729
  if filter_alternatives
@@ -1580,9 +1582,12 @@ class Ontology
1580
1582
  return terms_without_ancestors_and_alternatices
1581
1583
  end
1582
1584
 
1583
- def clean_profile_hard(profile)
1585
+ def clean_profile_hard(profile, options = {})
1584
1586
  profile, _ = check_ids(profile)
1585
1587
  profile = profile.select{|t| !is_obsolete?(t)}
1588
+ if !options[:term_filter].nil?
1589
+ profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
1590
+ end
1586
1591
  profile = clean_profile(profile.uniq)
1587
1592
  return profile
1588
1593
  end
@@ -1642,6 +1647,27 @@ class Ontology
1642
1647
  end
1643
1648
 
1644
1649
 
1650
+ def get_profile_redundancy()
1651
+ profile_sizes = self.get_profiles_sizes
1652
+ parental_terms_per_profile = self.parentals_per_profile # clean_profiles
1653
+ parental_terms_per_profile = parental_terms_per_profile.map{|item| item[0]}
1654
+ profile_sizes, parental_terms_per_profile = profile_sizes.zip(parental_terms_per_profile).sort_by{|i| i.first}.reverse.transpose
1655
+ return profile_sizes, parental_terms_per_profile
1656
+ end
1657
+
1658
+ def compute_term_list_and_childs()
1659
+ suggested_childs = {}
1660
+ total_terms = 0
1661
+ terms_with_more_specific_childs = 0
1662
+ @profiles.each do |id, terms|
1663
+ total_terms += terms.length
1664
+ more_specific_childs = self.get_childs_table(terms, true)
1665
+ terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
1666
+ suggested_childs[id] = more_specific_childs
1667
+ end
1668
+ return suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
1669
+ end
1670
+
1645
1671
  # Calculates mean IC of a given profile
1646
1672
  # ===== Parameters
1647
1673
  # +prof+:: profile to be checked
@@ -2215,7 +2241,24 @@ class Ontology
2215
2241
  return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
2216
2242
  end
2217
2243
 
2244
+ def each(att = false)
2245
+ @stanzas[:terms].each do |id, tags|
2246
+ next if @alternatives_index.include?(id)
2247
+ if att
2248
+ yield(id, tags)
2249
+ else
2250
+ yield(id)
2251
+ end
2252
+ end
2253
+ end
2218
2254
 
2255
+ def list_term_attributes
2256
+ terms = []
2257
+ each do |code|
2258
+ terms << [code, translate_id(code), get_term_level(code)]
2259
+ end
2260
+ return terms
2261
+ end
2219
2262
 
2220
2263
  #============================================================================
2221
2264
  #============================================================================
@@ -2414,6 +2457,28 @@ class Ontology
2414
2457
  return Math.log(pvalA)/Math.log(pvalB)
2415
2458
  end
2416
2459
 
2460
+ def profile_stats
2461
+ stats = Hash.new(0)
2462
+ data = @profiles.values.map{|ont_ids| ont_ids.size}
2463
+ stats[:average] = data.sum().fdiv(data.size)
2464
+ sum_devs = data.sum{|element| (element - stats[:average]) ** 2}
2465
+ stats[:variance] = sum_devs.fdiv(data.size)
2466
+ stats[:standardDeviation] = stats[:variance] ** 0.5
2467
+ stats[:max] = data.max
2468
+ stats[:min] = data.min
2469
+
2470
+ stats[:count] = data.size
2471
+ data.each do |value|
2472
+ stats[:countNonZero] += 1 if value != 0
2473
+ end
2474
+
2475
+ stats[:q1] = data.get_quantiles(0.25)
2476
+ stats[:median] = data.get_quantiles(0.5)
2477
+ stats[:q3] = data.get_quantiles(0.75)
2478
+ return stats
2479
+
2480
+ end
2481
+
2417
2482
  #============================================================================
2418
2483
  #============================================================================
2419
2484
 
@@ -92,7 +92,7 @@ end
92
92
  # +charsToRemove+:: char (or chars set) to be removed from texts to be compared
93
93
  # +unique+:: boolean flag which indicates if repeated elements must be removed
94
94
  # Returns the similarity percentage for all elements into array
95
- def similitude_network(items_array, splitChar = ";", charsToRemove = "", unique = false)
95
+ def similitude_network(items_array, splitChar: ";", charsToRemove: "", unique: false)
96
96
  # Special cases
97
97
  return nil if items_array.nil?
98
98
  return nil if !items_array.is_a? Array
@@ -1,3 +1,3 @@
1
1
  module Semtools
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.8"
3
3
  end
data/lib/semtools.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require "semtools/version"
2
2
  require "semtools/sim_handler"
3
- require "semtools/math_methods"
4
3
  require "semtools/ontology"
5
4
 
6
5
  module Semtools
data/semtools.gemspec CHANGED
@@ -31,6 +31,8 @@ Gem::Specification.new do |spec|
31
31
  spec.require_paths = ["lib"]
32
32
 
33
33
  spec.add_dependency "text"
34
+ spec.add_dependency "down"
35
+ spec.add_dependency "expcalc"
34
36
 
35
37
  spec.add_development_dependency "rake"
36
38
  spec.add_development_dependency "rspec"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoanezonjic
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-05-27 00:00:00.000000000 Z
12
+ date: 2022-03-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: text
@@ -25,6 +25,34 @@ dependencies:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
27
  version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: down
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: expcalc
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
28
56
  - !ruby/object:Gem::Dependency
29
57
  name: rake
30
58
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +103,7 @@ email:
75
103
  executables:
76
104
  - console
77
105
  - onto2json.rb
106
+ - semtools.rb
78
107
  - setup
79
108
  - strsimnet.rb
80
109
  extensions: []
@@ -90,8 +119,10 @@ files:
90
119
  - Rakefile
91
120
  - bin/console
92
121
  - bin/onto2json.rb
122
+ - bin/semtools.rb
93
123
  - bin/setup
94
124
  - bin/strsimnet.rb
125
+ - external_data/ontologies.txt
95
126
  - lib/data/hp.obo
96
127
  - lib/data/phenotype_annotation.tab
97
128
  - lib/semtools.rb
@@ -119,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
150
  - !ruby/object:Gem::Version
120
151
  version: '0'
121
152
  requirements: []
122
- rubygems_version: 3.2.3
153
+ rubygems_version: 3.2.15
123
154
  signing_key:
124
155
  specification_version: 4
125
156
  summary: Gem to handle semantic based calculations in text and defined ontologies