semtools 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
4
- data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
3
+ metadata.gz: a3f63cc6548a9938e31121d2018d1c1c477987007c5d253b5fa814a285bdb576
4
+ data.tar.gz: e1911d3157c3046590ca13bc86215d2260b4a8b2b1b25affa5c2673881036795
5
5
  SHA512:
6
- metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
7
- data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
6
+ metadata.gz: 30c95df80957a4a35b6fea05b9552352f529d8e45c10f6b128924a3ce2ee5d90e92a1e9d5fe0016d25538147e12d3a9199c81222642c94cdd0eb3c89eea168ef
7
+ data.tar.gz: ddc9e600fd984e68d060b7be05adf27b3f20bb67e638d42acc4b9b156eedabfce20d6f588a03d1fbc2948fedbd80d498f1767c0e3f8ea03720fa0ca327b95f3c
data/Gemfile CHANGED
@@ -5,5 +5,8 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
5
5
  # Specify your gem's dependencies in semtools.gemspec
6
6
  gemspec
7
7
 
8
- gem "rake", "~> 12.0"
8
+ gem "rake", "~> 13.0"
9
9
  gem "minitest", "~> 5.0"
10
+
11
+ expcalc_dev_path = File.expand_path('~/dev_gems/expcalc')
12
+ gem "expcalc", github: "seoanezonjic/expcalc", branch: "master" if Dir.exist?(expcalc_dev_path)
data/bin/semtools.rb ADDED
@@ -0,0 +1,446 @@
1
+ #! /usr/bin/env ruby
2
+ ROOT_PATH = File.dirname(__FILE__)
3
+ $LOAD_PATH.unshift(File.expand_path(File.join(ROOT_PATH, '..', 'lib')))
4
+ EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
5
+
6
+ require 'optparse'
7
+ require 'down'
8
+ require 'semtools'
9
+
10
+ ######################################################################################
11
+ ## METHODS
12
+ ######################################################################################
13
# Reads a tab-separated text file into memory.
#
# @param file [String] path of the file to read
# @return [Array<Array<String>>] one array of fields per line, in file order
#   (an empty line yields an empty array)
def load_tabular_file(file)
  # File.foreach closes the handle once iteration is over; the previous
  # File.open(file).each left the descriptor open.
  File.foreach(file).map { |line| line.chomp.split("\t") }
end
22
+
23
# Registers every (id, terms) pair of +file+ in the ontology.
#
# @param file [#each] collection yielding profile id / term list pairs
#   (an Array of pairs or a Hash)
# @param ontology [#add_profile] object receiving the profiles
def store_profiles(file, ontology)
  file.each { |profile_id, profile_terms| ontology.add_profile(profile_id, profile_terms) }
end
28
+
29
# Accumulates +value+ under +key+ in a hash whose values are arrays.
#
# A missing key is initialised with +value+ (wrapped in an array when it is
# not one already); an existing key gets +value+ appended or concatenated.
#
# @param hash_to_load [Hash] hash updated in place
# @param key [Object] key to create or extend
# @param value [Object, Array] element or list of elements to add
# @param unique [Boolean, nil] when truthy, duplicates are removed after
#   appending to an existing entry. The previous check (`unique == nil`)
#   de-duplicated even when +false+ was passed.
def load_value(hash_to_load, key, value, unique = true)
  query = hash_to_load[key]
  if query.nil?
    hash_to_load[key] = value.is_a?(Array) ? value : [value]
  else
    value.is_a?(Array) ? query.concat(value) : query << value
    query.uniq! if unique
  end
end
43
+
44
# Translates profiles between term codes and term names.
#
# With type 'names' every profile stored in +ontology+ is replaced by the
# result of +ontology.translate_ids+. With type 'codes' every entry of
# +profiles+ is replaced by its translated terms (from
# +ontology.translate_names+) joined with +options[:separator]+.
# Terms that could not be translated are written, one profile per line
# ("id<TAB>term;term"), to +options[:untranslated_path]+.
#
# @param ontology [Object] ontology responding to profiles, translate_ids
#   and translate_names
# @param type [String] 'names' or 'codes'; any other value translates nothing
# @param options [Hash] reads :separator and :untranslated_path
# @param profiles [Hash, nil] id => names, required for type 'codes'
# @return [Hash, nil] untranslated terms per profile id, or nil when
#   everything was translated
def translate(ontology, type, options, profiles = nil)
  not_translated = {}
  case type
  when 'names'
    ontology.profiles.each do |id, terms|
      translation, untranslated = ontology.translate_ids(terms)
      ontology.profiles[id] = translation
      not_translated[id] = untranslated unless untranslated.empty?
    end
  when 'codes'
    profiles.each do |id, terms|
      translation, untranslated = ontology.translate_names(terms)
      profiles[id] = translation.join(options[:separator].to_s)
      not_translated[id] = untranslated unless untranslated.empty?
    end
  end
  return if not_translated.empty?

  report_path = options[:untranslated_path]
  if report_path.nil?
    # The option defaults to nil; File.open(nil) used to abort the whole run
    # with a TypeError as soon as a single term could not be translated.
    warn "#{not_translated.length} profile(s) have untranslated terms; set an untranslated path to save them"
    return not_translated
  end
  File.open(report_path, 'w') do |file|
    not_translated.each { |id, terms| file.puts([id, terms.join(";")].join("\t")) }
  end
  not_translated
end
68
+
69
# Cleans one profile through +ontology.clean_profile_hard+ and, when
# +options[:term_filter]+ is set, keeps only the terms whose ancestor list
# contains that term.
#
# @param profile [Array] terms of the profile
# @param ontology [Object] responds to clean_profile_hard and get_ancestors
# @param options [Hash] reads :term_filter
# @return [Array] the cleaned (and possibly filtered) profile
def clean_profile(profile, ontology, options)
  cleaned_profile = ontology.clean_profile_hard(profile)
  required_ancestor = options[:term_filter]
  return cleaned_profile if required_ancestor.nil?

  cleaned_profile.select! { |term| ontology.get_ancestors(term).include?(required_ancestor) }
  cleaned_profile
end
76
+
77
# Cleans every profile in place with +clean_profile+ and drops the profiles
# that end up without terms.
#
# @param profiles [Hash] id => terms, modified in place
# @param ontology [Object] forwarded to clean_profile
# @param options [Hash] forwarded to clean_profile
# @return [Array] ids of the profiles that were removed
def clean_profiles(profiles, ontology, options)
  # Re-assigning existing keys while iterating is safe; deletions are
  # postponed until the iteration has finished.
  profiles.each_key do |id|
    profiles[id] = clean_profile(profiles[id], ontology, options)
  end
  removed_profiles = profiles.select { |_id, terms| terms.empty? }.keys
  removed_profiles.each { |id| profiles.delete(id) }
  removed_profiles
end
87
+
88
# Adds to every profile the ancestors of its terms.
#
# The previous implementation appended to the very array it was iterating
# (so the appended elements were visited as if they were terms) and pushed
# each ancestor list as a single nested element. Ancestors are now gathered
# first and then appended flat, skipping terms already present.
#
# @param profiles [Hash] id => Array of terms, expanded in place
# @param ontology [#get_ancestors] returns the ancestors of a term (a nil
#   result is treated as "no ancestors")
# @param unwanted_terms [Array, String] ancestors that must not be added; a
#   comma separated String (as received from the command line) is split and
#   converted to symbols
# @return [Hash] the same +profiles+ object
def expand_profiles(profiles, ontology, unwanted_terms = [])
  unwanted =
    if unwanted_terms.is_a?(String)
      unwanted_terms.split(',').map(&:to_sym)
    else
      Array(unwanted_terms)
    end
  profiles.each do |_disease_id, terms|
    ancestors = terms.flat_map { |term| Array(ontology.get_ancestors(term)) }
    terms.concat((ancestors - unwanted - terms).uniq)
  end
end
95
+
96
# Writes the all-against-all similarity of the ontology profiles.
#
# The output goes to "<basename of input without extension>_semantic_similarity_list"
# in the current directory, one "query<TAB>search<TAB>similarity" line per
# ordered pair of profiles (a profile is also compared with itself).
#
# @param input [String] input file path, only used to name the output file
# @param onto_obj [Object] responds to profiles and compare
# @param similarity_type [Symbol] forwarded to compare as sim_type
def write_similarity_profile_list(input, onto_obj, similarity_type)
  similarity_file = "#{File.basename(input, ".*")}_semantic_similarity_list"
  File.open(similarity_file, 'w') do |out|
    onto_obj.profiles.each do |query_id, query_terms|
      onto_obj.profiles.each do |search_id, search_terms|
        score = onto_obj.compare(query_terms, search_terms, sim_type: similarity_type)
        out.puts([query_id, search_id, score].join("\t"))
      end
    end
  end
end
106
+
107
# Downloads an ontology listed in the index file, or lists the .obo files
# stored next to the index.
#
# @param source [String] path of the tab-separated index (key<TAB>url)
# @param key [String] ontology key to download, or 'list' to print the .obo
#   files found in the directory of +source+
# @param output [String, nil] destination path; when nil the file is saved
#   as "<key>.obo" in the index directory if it is writable, otherwise in
#   the current directory
def download(source, key, output)
  source_list = load_tabular_file(source).to_h
  external_data = File.dirname(source)
  if key == 'list'
    Dir.glob(File.join(external_data, '*.obo')) { |f| puts f }
    return
  end

  url = source_list[key]
  if url.nil?
    # An unknown key used to be ignored silently, so a typo looked like success.
    warn "Ontology '#{key}' is not listed in #{source}. Known keys: #{source_list.keys.join(', ')}"
    return
  end

  output_path = output
  if output_path.nil?
    file_name = key + '.obo'
    output_path = File.writable?(external_data) ? File.join(external_data, file_name) : file_name
  end
  Down::NetHttp.download(url, destination: output_path, max_redirects: 5)
  File.chmod(0644, output_path) # Correct file permissions set by down gem
end
130
+
131
# Resolves the ontology given on the command line to a file path.
#
# An existing path is returned untouched. Otherwise +path+ is looked up as a
# key of the index file and mapped to "<index directory>/<key>.obo"; the
# existence of that file is not checked here. Unknown keys abort the program.
#
# @param path [String] path to an ontology file or key of the index
# @param source [String] path of the tab-separated index (key<TAB>url)
# @return [String] path of the ontology file
def get_ontology_file(path, source)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return path if File.exist?(path)

  ont_index = load_tabular_file(source).to_h
  abort("Input ontology file not exists") if ont_index[path].nil?
  File.join(File.dirname(source), path + '.obo')
end
142
+
143
# Converts a statistics hash into labelled rows ready to be printed.
#
# @param stats [Hash] reads :count, :countNonZero, :max, :min, :average,
#   :variance, :standardDeviation, :q1, :median and :q3
# @return [Array<Array>] [label, value] pairs in report order; the
#   'Non Zero Density' row is countNonZero divided by count
def get_stats(stats)
  non_zero_density = stats[:countNonZero].fdiv(stats[:count])
  [
    ['Elements', stats[:count]],
    ['Elements Non Zero', stats[:countNonZero]],
    ['Non Zero Density', non_zero_density],
    ['Max', stats[:max]],
    ['Min', stats[:min]],
    ['Average', stats[:average]],
    ['Variance', stats[:variance]],
    ['Standard Deviation', stats[:standardDeviation]],
    ['Q1', stats[:q1]],
    ['Median', stats[:median]],
    ['Q3', stats[:q3]]
  ]
end
158
+
159
+
160
+
161
+
162
+
163
+
164
+ ####################################################################################
165
+ ## OPTPARSE
166
+ ####################################################################################
167
# Command line options. Defaults are assigned right before each switch is
# declared, so +options+ always holds every key once parsing is done.
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"

  options[:download] = nil
  opts.on("-d", "--download STRING", "Download obo file from official resource. MONDO, GO and HPO are possible values.") do |item|
    options[:download] = item
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Filepath of profile data") do |item|
    options[:input_file] = item
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output filepath") do |item|
    options[:output_file] = item
  end

  options[:IC] = false
  opts.on("-I", "--IC", "Get IC") do
    options[:IC] = true
  end

  options[:ontology_file] = nil
  opts.on("-O PATH", "--ontology_file PATH", "Path to ontology file") do |item|
    options[:ontology_file] = item
  end

  options[:term_filter] = nil
  opts.on("-T STRING", "--term_filter STRING", "If specified, only terms that are descendants of the specified term will be kept on a profile when cleaned") do |item|
    options[:term_filter] = item.to_sym
  end

  options[:translate] = nil
  opts.on("-t STRING", "--translate STRING", "Translate to 'names' or to 'codes'") do |item|
    options[:translate] = item
  end

  # Explicit default, consistent with the rest of the options.
  options[:similarity] = nil
  opts.on("-s method", "--similarity method", "Calculate similarity between profile IDs computed by 'resnik', 'lin' or 'jiang_conrath' methods. ") do |sim_method|
    options[:similarity] = sim_method.to_sym
  end

  options[:clean_profiles] = false
  opts.on("-c", "--clean_profiles", "Removes ancestors, descendants and obsolete terms from profiles") do
    options[:clean_profiles] = true
  end

  options[:removed_path] = 'rejected_profs'
  opts.on("-r PATH", "--removed_path PATH", "Desired path to write removed profiles file") do |item|
    options[:removed_path] = item
  end

  options[:untranslated_path] = nil
  opts.on("-u PATH", "--untranslated_path PATH", "Desired path to write untranslated terms file") do |item|
    options[:untranslated_path] = item
  end

  options[:keyword] = nil
  opts.on("-k STRING", "--keyword STRING", "regex used to get xref terms in the ontology file") do |item|
    options[:keyword] = item
  end

  options[:xref_sense] = :byValue
  # The switch used to be declared as "--xref_sense " (trailing space), which
  # OptionParser reads as an option with a mandatory argument instead of a flag.
  opts.on("--xref_sense", "Xref translation sense: xref-ontology by default; ontology-xref if set") do
    options[:xref_sense] = :byTerm
  end

  options[:expand_profiles] = false
  opts.on("-e", "--expand_profiles", "Expand profiles adding ancestors") do
    options[:expand_profiles] = true
  end

  options[:unwanted_terms] = []
  opts.on("-U", "--unwanted_terms STRING", "Comma separated terms not wanted to be included in profile expansion") do |item|
    # Stored as a list of symbols, like the terms of the loaded profiles; the
    # raw String made Array#difference raise during profile expansion.
    options[:unwanted_terms] = item.split(',').map(&:to_sym)
  end

  options[:separator] = ";"
  opts.on("-S STRING", "--separator STRING", "Separator used for the terms profile") do |sep|
    options[:separator] = sep
  end

  # [terms, modifiers]: "MODIFIERS/TERM1,TERM2" or just "TERM1,TERM2".
  options[:childs] = [[], '']
  opts.on("-C STRING", "--childs STRING", "Term code list (comma separated) to generate child list") do |item|
    if item.include?('/')
      modifiers, terms = item.split('/')
    else
      modifiers = ''
      terms = item
    end
    terms = terms.split(',').map { |t| t.to_sym }
    options[:childs] = [terms, modifiers]
  end

  options[:statistics] = false
  opts.on("-n", "--statistics", "To obtain main statistical descriptors of the profiles file") do
    options[:statistics] = true
  end

  options[:list_translate] = nil
  opts.on("-l STRING", "--list_translate STRING", "Translate to 'names' or to 'codes' input list") do |sense|
    options[:list_translate] = sense
  end

  options[:subject_column] = 0
  opts.on("-f NUM", "--subject_column NUM", "The number of the column for the subject id") do |ncol|
    options[:subject_column] = ncol.to_i
  end

  options[:annotations_column] = 1
  opts.on("-a NUM", "--annotations_column NUM", "The number of the column for the annotation ids") do |ncol|
    options[:annotations_column] = ncol.to_i
  end

  options[:list_term_attributes] = false
  # The help text was a copy of the --annotations_column description.
  opts.on("--list_term_attributes", "Print the term attributes listed by the ontology") do
    options[:list_term_attributes] = true
  end
end
option_parser.parse!
289
+
290
+ ####################################################################################
291
+ ## MAIN
292
+ ####################################################################################
293
# Index file mapping ontology keys to their download URLs.
ont_index_file = File.join(EXTERNAL_DATA, 'ontologies.txt')

# Download mode: fetch (or list) ontologies and stop.
unless options[:download].nil?
  download(ont_index_file, options[:download], options[:output_file])
  Process.exit
end

unless options[:ontology_file].nil?
  options[:ontology_file] = get_ontology_file(options[:ontology_file], ont_index_file)
end
ontology = Ontology.new(file: options[:ontology_file], load_file: true)

# These modes read the input rows or name their output after the input file;
# without -i they used to fail with NoMethodError/TypeError on nil.
needs_input = !options[:list_translate].nil? ||
              options[:translate] == 'codes' ||
              !options[:keyword].nil? ||
              !options[:similarity].nil? ||
              options[:IC]
abort('An input file (-i) is required for the requested operation') if needs_input && options[:input_file].nil?

data = nil
unless options[:input_file].nil?
  data = load_tabular_file(options[:input_file])
  if options[:list_translate].nil? || !options[:keyword].nil?
    # Rows become [subject id, [term symbols]].
    data.map! do |row|
      [row[options[:subject_column]],
       row[options[:annotations_column]].split(options[:separator]).map! { |term| term.to_sym }]
    end
    store_profiles(data, ontology) if options[:translate] != 'codes' && options[:keyword].nil?
  end
end

# List translation mode: translate each input row, print it and stop.
unless options[:list_translate].nil?
  unless %w[names codes].include?(options[:list_translate])
    abort("Unknown list translation '#{options[:list_translate]}': use 'names' or 'codes'")
  end
  data.each do |term|
    if options[:list_translate] == 'names'
      translation, _untranslated = ontology.translate_ids(term)
    else
      translation, _untranslated = ontology.translate_names(term)
    end
    puts "#{term.first}\t#{translation.empty? ? '-' : translation.first}"
  end
  Process.exit
end

if options[:translate] == 'codes'
  profiles = {}
  data.each do |id, terms|
    # The rows were already split into arrays of symbols when loaded, so the
    # former terms.split(separator) raised NoMethodError (Array has no #split).
    # Names are handed over as plain strings, as in the list translation above.
    # NOTE(review): translate() stores each profile as a separator-joined
    # String before store_profiles; confirm add_profile accepts that.
    profiles[id] = terms.map(&:to_s)
  end
  translate(ontology, 'codes', options, profiles)
  store_profiles(profiles, ontology)
end

if options[:clean_profiles]
  removed_profiles = clean_profiles(ontology.profiles, ontology, options)
  unless removed_profiles.nil? || removed_profiles.empty?
    File.open(options[:removed_path], 'w') do |f|
      removed_profiles.each do |profile|
        f.puts profile
      end
    end
  end
end

if options[:expand_profiles]
  expand_profiles(ontology.profiles, ontology, options[:unwanted_terms])
end

unless options[:similarity].nil?
  write_similarity_profile_list(options[:input_file], ontology, options[:similarity])
end

if options[:IC]
  ontology.add_observed_terms_from_profiles
  by_ontology, by_freq = ontology.get_profiles_resnik_dual_ICs
  ic_file = File.basename(options[:input_file], ".*") + '_IC_onto_freq'
  File.open(ic_file, 'w') do |file|
    ontology.profiles.keys.each do |id|
      file.puts([id, by_ontology[id], by_freq[id]].join("\t"))
    end
  end
end

if options[:translate] == 'names'
  translate(ontology, 'names', options)
end

# Child listing: modifier 'r' prints parent/child relations instead of a flat
# list, modifier 'n' prints names instead of codes.
unless options[:childs].first.empty?
  terms, modifiers = options[:childs]
  all_childs = []
  terms.each do |term|
    childs = ontology.get_descendants(term)
    all_childs = all_childs | childs
  end
  if modifiers.include?('r')
    relations = []
    all_childs = all_childs | terms # Add parents that generated child list
    all_childs.each do |term|
      descendants = ontology.get_direct_descendants(term)
      next if descendants.nil?

      descendants.each do |desc|
        relations << [term, desc]
      end
    end
    relations.each do |rel|
      rel, _ = ontology.translate_ids(rel) if modifiers.include?('n')
      puts rel.join("\t")
    end
  else
    all_childs.each do |c|
      if modifiers.include?('n')
        puts ontology.translate_id(c)
      else
        puts c
      end
    end
  end
end

unless options[:output_file].nil?
  File.open(options[:output_file], 'w') do |file|
    ontology.profiles.each do |id, terms|
      file.puts([id, terms.join("|")].join("\t"))
    end
  end
end

if options[:statistics]
  get_stats(ontology.profile_stats).each do |stat|
    puts stat.join("\t")
  end
end

if options[:list_term_attributes]
  term_attributes = ontology.list_term_attributes
  term_attributes.each do |t_attr|
    t_attr[0] = t_attr[0].to_s
    puts t_attr.join("\t")
  end
end

# Xref mode: translate the terms of every input row through the xref
# dictionary built from the entries matching the keyword regex.
unless options[:keyword].nil?
  xref_translated = []
  ontology.calc_dictionary(:xref, select_regex: /(#{options[:keyword]})/, store_tag: :tag, multiterm: true, substitute_alternatives: false)
  dict = ontology.dicts[:tag][options[:xref_sense]]
  data.each do |id, prof|
    xrefs = []
    prof.each do |t|
      query = dict[t.to_s]
      xrefs.concat(query) unless query.nil?
    end
    xref_translated << [id, xrefs] unless xrefs.empty?
  end
  write_xrefs = lambda do |out|
    xref_translated.each do |id, prof|
      prof.each do |t|
        out.puts [id, t].join("\t")
      end
    end
  end
  if options[:output_file].nil?
    # Without -o the former File.open(nil, 'w') raised TypeError; print instead.
    write_xrefs.call($stdout)
  else
    File.open(options[:output_file], 'w', &write_xrefs)
  end
end
data/bin/strsimnet.rb CHANGED
@@ -111,12 +111,11 @@ texts2compare = load_table_file(input_file = options[:input_file],
111
111
  targetCol = options[:cindex],
112
112
  filterCol = options[:findex],
113
113
  filterValue = options[:filter_value])
114
-
115
114
  # Verbose point
116
115
  puts "Calculating similitude for (" + texts2compare.length.to_s + ") elements"
117
116
 
118
117
  # Obtain all Vs all
119
- similitudes_AllVsAll = similitude_network(texts2compare,options[:rm_char])
118
+ similitudes_AllVsAll = similitude_network(texts2compare, charsToRemove: options[:rm_char])
120
119
 
121
120
  # Verbose point
122
121
  puts "Writing output file ..."
@@ -0,0 +1,4 @@
1
+ GO http://purl.obolibrary.org/obo/go/go-basic.obo
2
+ HPO http://purl.obolibrary.org/obo/hp.obo
3
+ MONDO http://purl.obolibrary.org/obo/mondo.obo
4
+ EFO http://www.ebi.ac.uk/efo/efo.obo
@@ -1,148 +1,148 @@
1
- # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
- #to cmpute fisher exact test
3
- #Fisher => http://www.biostathandbook.com/fishers.html
4
- def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
- #puts '-', listA.inspect, listB.inspect, '-'
6
- listA_listB = listA & listB
7
- listA_nolistB = listA - listB
8
- nolistA_listB = listB - listA
9
- if weigths.nil?
10
- listA_listB_count = listA_listB.length
11
- listA_nolistB_count = listA_nolistB.length
12
- nolistA_listB_count = nolistA_listB.length
13
- nolistA_nolistB_count = all_elements_count - (listA | listB).length
14
- else
15
- # Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
16
- # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
17
- listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
- listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
- nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
1
+ # # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
+ # #to cmpute fisher exact test
3
+ # #Fisher => http://www.biostathandbook.com/fishers.html
4
+ # def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
+ # #puts '-', listA.inspect, listB.inspect, '-'
6
+ # listA_listB = listA & listB
7
+ # listA_nolistB = listA - listB
8
+ # nolistA_listB = listB - listA
9
+ # if weigths.nil?
10
+ # listA_listB_count = listA_listB.length
11
+ # listA_nolistB_count = listA_nolistB.length
12
+ # nolistA_listB_count = nolistA_listB.length
13
+ # nolistA_nolistB_count = all_elements_count - (listA | listB).length
14
+ # else
15
+ # # Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
16
+ # # https://academic.oup.com/bioinformatics/article/22/13/1600/193669
17
+ # listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
+ # listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
+ # nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
20
20
 
21
- if partial_weigths
22
- nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
- all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
- else
25
- nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
- all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
- end
28
- end
29
- #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
30
- if tail == 'two_sided'
31
- accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
32
- elsif tail == 'less'
33
- accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
34
- end
35
- return accumulated_prob
36
- end
21
+ # if partial_weigths
22
+ # nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
+ # all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
+ # else
25
+ # nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
+ # all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
+ # end
28
+ # end
29
+ # #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
30
+ # if tail == 'two_sided'
31
+ # accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
32
+ # elsif tail == 'less'
33
+ # accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
34
+ # end
35
+ # return accumulated_prob
36
+ # end
37
37
 
38
- def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
39
- #https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
40
- accumulated_prob = 0
41
- ref_prob = compute_hyper_prob(
42
- listA_listB_count,
43
- listA_nolistB_count,
44
- nolistA_listB_count,
45
- nolistA_nolistB_count,
46
- all_elements_count
47
- )
48
- accumulated_prob += ref_prob
49
- [listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
50
- n += 1
51
- prob = compute_hyper_prob(
52
- listA_listB_count - n,
53
- listA_nolistB_count + n,
54
- nolistA_listB_count + n,
55
- nolistA_nolistB_count - n,
56
- all_elements_count
57
- )
58
- prob <= ref_prob ? accumulated_prob += prob : break
59
- end
38
+ # def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
39
+ # #https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
40
+ # accumulated_prob = 0
41
+ # ref_prob = compute_hyper_prob(
42
+ # listA_listB_count,
43
+ # listA_nolistB_count,
44
+ # nolistA_listB_count,
45
+ # nolistA_nolistB_count,
46
+ # all_elements_count
47
+ # )
48
+ # accumulated_prob += ref_prob
49
+ # [listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
50
+ # n += 1
51
+ # prob = compute_hyper_prob(
52
+ # listA_listB_count - n,
53
+ # listA_nolistB_count + n,
54
+ # nolistA_listB_count + n,
55
+ # nolistA_nolistB_count - n,
56
+ # all_elements_count
57
+ # )
58
+ # prob <= ref_prob ? accumulated_prob += prob : break
59
+ # end
60
60
 
61
- [listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
62
- n += 1
63
- prob = compute_hyper_prob(
64
- listA_listB_count + n,
65
- listA_nolistB_count - n,
66
- nolistA_listB_count - n,
67
- nolistA_nolistB_count + n,
68
- all_elements_count
69
- )
70
- accumulated_prob += prob if prob <= ref_prob
71
- end
61
+ # [listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
62
+ # n += 1
63
+ # prob = compute_hyper_prob(
64
+ # listA_listB_count + n,
65
+ # listA_nolistB_count - n,
66
+ # nolistA_listB_count - n,
67
+ # nolistA_nolistB_count + n,
68
+ # all_elements_count
69
+ # )
70
+ # accumulated_prob += prob if prob <= ref_prob
71
+ # end
72
72
 
73
- return accumulated_prob
74
- end
73
+ # return accumulated_prob
74
+ # end
75
75
 
76
- def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
77
- accumulated_prob = 0
78
- [listA_listB_count, nolistA_nolistB_count].min.times do |n|
79
- accumulated_prob += compute_hyper_prob(
80
- listA_listB_count - n,
81
- listA_nolistB_count + n,
82
- nolistA_listB_count + n,
83
- nolistA_nolistB_count - n,
84
- all_elements_count
85
- )
86
- end
87
- return accumulated_prob
88
- end
76
+ # def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
77
+ # accumulated_prob = 0
78
+ # [listA_listB_count, nolistA_nolistB_count].min.times do |n|
79
+ # accumulated_prob += compute_hyper_prob(
80
+ # listA_listB_count - n,
81
+ # listA_nolistB_count + n,
82
+ # nolistA_listB_count + n,
83
+ # nolistA_nolistB_count - n,
84
+ # all_elements_count
85
+ # )
86
+ # end
87
+ # return accumulated_prob
88
+ # end
89
89
 
90
- def compute_hyper_prob(a, b, c, d, n)
91
- # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
92
- binomA = binom(a + b, a)
93
- binomC = binom(c + d, c)
94
- divisor = binom(n, a + c)
95
- return (binomA * binomC).fdiv(divisor)
96
- end
90
+ # def compute_hyper_prob(a, b, c, d, n)
91
+ # # https://en.wikipedia.org/wiki/Fisher%27s_exact_test
92
+ # binomA = binom(a + b, a)
93
+ # binomC = binom(c + d, c)
94
+ # divisor = binom(n, a + c)
95
+ # return (binomA * binomC).fdiv(divisor)
96
+ # end
97
97
 
98
- def binom(n,k)
99
- if k > 0 && k < n
100
- res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
101
- else
102
- res = 1
103
- end
104
- end
98
+ # def binom(n,k)
99
+ # if k > 0 && k < n
100
+ # res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
101
+ # else
102
+ # res = 1
103
+ # end
104
+ # end
105
105
 
106
- #to cmpute adjusted pvalues
107
- #https://rosettacode.org/wiki/P-value_correction#Ruby
108
- def get_benjaminiHochberg_pvalues(arr_pvalues)
109
- n = arr_pvalues.length
110
- arr_o = order(arr_pvalues, true)
111
- arr_cummin_input = []
112
- (0..(n - 1)).each do |i|
113
- arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
114
- end
115
- arr_ro = order(arr_o)
116
- arr_cummin = cummin(arr_cummin_input)
117
- arr_pmin = pmin(arr_cummin)
118
- return arr_pmin.values_at(*arr_ro)
119
- end
106
+ # #to cmpute adjusted pvalues
107
+ # #https://rosettacode.org/wiki/P-value_correction#Ruby
108
+ # def get_benjaminiHochberg_pvalues(arr_pvalues)
109
+ # n = arr_pvalues.length
110
+ # arr_o = order(arr_pvalues, true)
111
+ # arr_cummin_input = []
112
+ # (0..(n - 1)).each do |i|
113
+ # arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
114
+ # end
115
+ # arr_ro = order(arr_o)
116
+ # arr_cummin = cummin(arr_cummin_input)
117
+ # arr_pmin = pmin(arr_cummin)
118
+ # return arr_pmin.values_at(*arr_ro)
119
+ # end
120
120
 
121
- def order(array, decreasing = false)
122
- if decreasing == false
123
- array.sort.map { |n| array.index(n) }
124
- else
125
- array.sort.map { |n| array.index(n) }.reverse
126
- end
127
- end
121
+ # def order(array, decreasing = false)
122
+ # if decreasing == false
123
+ # array.sort.map { |n| array.index(n) }
124
+ # else
125
+ # array.sort.map { |n| array.index(n) }.reverse
126
+ # end
127
+ # end
128
128
 
129
- def cummin(array)
130
- cumulative_min = array.first
131
- arr_cummin = []
132
- array.each do |p|
133
- cumulative_min = [p, cumulative_min].min
134
- arr_cummin << cumulative_min
135
- end
136
- return arr_cummin
137
- end
129
+ # def cummin(array)
130
+ # cumulative_min = array.first
131
+ # arr_cummin = []
132
+ # array.each do |p|
133
+ # cumulative_min = [p, cumulative_min].min
134
+ # arr_cummin << cumulative_min
135
+ # end
136
+ # return arr_cummin
137
+ # end
138
138
 
139
- def pmin(array)
140
- x = 1
141
- pmin_array = []
142
- array.each_index do |i|
143
- pmin_array[i] = [array[i], x].min
144
- abort if pmin_array[i] > 1
145
- end
146
- return pmin_array
147
- end
139
+ # def pmin(array)
140
+ # x = 1
141
+ # pmin_array = []
142
+ # array.each_index do |i|
143
+ # pmin_array[i] = [array[i], x].min
144
+ # abort if pmin_array[i] > 1
145
+ # end
146
+ # return pmin_array
147
+ # end
148
148
 
@@ -1,3 +1,4 @@
1
+ require 'expcalc'
1
2
  require 'json'
2
3
  require 'colorize'
3
4
 
@@ -45,7 +46,7 @@ class Ontology
45
46
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
46
47
  @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
47
48
  @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
48
-
49
+
49
50
  #############################################
50
51
  # CONSTRUCTOR
51
52
  #############################################
@@ -202,6 +203,7 @@ class Ontology
202
203
  # Only TERMS multivalue tags (future add Typedefs and Instance)
203
204
  # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
204
205
  attributes.each do |tag, value|
206
+ value.gsub!(/{source=[\\\":A-Za-z0-9\/\.\-, =]+} /, '') if tag == 'is_a' # To delete "source" attributes in is_a tag of MONDO ontology
205
207
  # Check
206
208
  raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
207
209
  # Prepare
@@ -553,14 +555,14 @@ class Ontology
553
555
  self.get_index_obsoletes
554
556
  self.get_index_alternatives
555
557
  self.get_index_child_parent_relations
556
- @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
558
+ @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
557
559
  ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
558
560
  @alternatives_index.compact!
559
- @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
561
+ @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
560
562
  @obsoletes_index.compact!
561
- @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
563
+ @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
562
564
  @ancestors_index.compact!
563
- @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
565
+ @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
564
566
  @descendants_index.compact!
565
567
  self.get_index_frequencies
566
568
  self.calc_dictionary(:name)
@@ -721,7 +723,7 @@ class Ontology
721
723
  # an array with all ancestors/descendants of given term or nil if parents are not available yet
722
724
  def get_familiar(term, return_ancestors = true, filter_alternatives = false)
723
725
  # Find into parentals
724
- familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
726
+ familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
725
727
  if !familiars.nil?
726
728
  familiars = familiars.clone
727
729
  if filter_alternatives
@@ -1580,9 +1582,12 @@ class Ontology
1580
1582
  return terms_without_ancestors_and_alternatices
1581
1583
  end
1582
1584
 
1583
- def clean_profile_hard(profile)
1585
# Strict profile clean-up: validates the ids, drops obsolete terms, optionally keeps only
# terms that have a given ancestor, removes duplicates and applies clean_profile.
# ===== Parameters
# +profile+:: list of term ids to be cleaned
# +options+:: hash of options; when +:term_filter+ is not nil, only terms whose ancestors
#             (as returned by get_ancestors) include that value are kept
# ===== Returns
# the value returned by clean_profile for the filtered, de-duplicated profile
def clean_profile_hard(profile, options = {})
  # check_ids returns several values; only the first one is used here
  checked_terms, _ = check_ids(profile)
  current_terms = checked_terms.reject { |t| is_obsolete?(t) }
  required_ancestor = options[:term_filter]
  unless required_ancestor.nil?
    current_terms.select! { |term| get_ancestors(term).include?(required_ancestor) }
  end
  return clean_profile(current_terms.uniq)
end
@@ -1642,6 +1647,27 @@ class Ontology
1642
1647
  end
1643
1648
 
1644
1649
 
1650
# Pairs every profile size with the first element of the matching entry returned by
# parentals_per_profile and orders the pairs from the largest profile size to the smallest.
# NOTE(review): the first element of each entry is presumably the parental-term
# information of that profile - confirm against parentals_per_profile.
# ===== Returns
# two parallel arrays: the sorted profile sizes and their associated parental values
def get_profile_redundancy
  sizes = self.get_profiles_sizes
  parental_values = self.parentals_per_profile.map { |item| item[0] }
  # Ascending sort followed by reverse gives the descending order
  ranked_pairs = sizes.zip(parental_values).sort_by { |pair| pair.first }.reverse
  profile_sizes, parental_terms_per_profile = ranked_pairs.transpose
  return profile_sizes, parental_terms_per_profile
end
1657
+
1658
# For every stored profile, collects the table returned by get_childs_table and measures
# which fraction of all profile terms has a non-empty last field in that table.
# NOTE(review): the last field of each record presumably holds the more specific child
# terms - confirm against get_childs_table.
# ===== Returns
# a hash (profile id => childs table) and the ratio of records with a non-empty last
# field over the total number of terms in all profiles
def compute_term_list_and_childs
  suggested_childs = {}
  term_total = 0
  terms_with_childs = 0
  @profiles.each do |id, terms|
    childs_table = self.get_childs_table(terms, true)
    suggested_childs[id] = childs_table
    term_total += terms.length
    # Records whose last field is empty are not counted
    terms_with_childs += childs_table.reject { |record| record.last.empty? }.length
  end
  return suggested_childs, terms_with_childs.fdiv(term_total)
end
1670
+
1645
1671
  # Calculates mean IC of a given profile
1646
1672
  # ===== Parameters
1647
1673
  # +prof+:: profile to be checked
@@ -2215,7 +2241,24 @@ class Ontology
2215
2241
  return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
2216
2242
  end
2217
2243
 
2244
# Iterates over the entries of @stanzas[:terms], skipping every id that is a key of
# @alternatives_index.
# ===== Parameters
# +att+:: when true the block receives (id, tags); otherwise it receives only the id
# ===== Returns
# an Enumerator when no block is given; otherwise the value returned by iterating
# @stanzas[:terms]
def each(att = false)
  # Without a block a bare yield would raise LocalJumpError; hand back an Enumerator instead
  return enum_for(:each, att) unless block_given?

  @stanzas[:terms].each do |id, tags|
    next if @alternatives_index.include?(id)
    if att
      yield(id, tags)
    else
      yield(id)
    end
  end
end
2218
2254
 
2255
# Builds one row per term yielded by each, holding the term code together with the
# values returned by translate_id and get_term_level for that code.
# ===== Returns
# an array of [code, translate_id(code), get_term_level(code)] rows
def list_term_attributes
  to_enum(:each).map do |code|
    [code, translate_id(code), get_term_level(code)]
  end
end
2219
2262
 
2220
2263
  #============================================================================
2221
2264
  #============================================================================
@@ -2414,6 +2457,28 @@ class Ontology
2414
2457
  return Math.log(pvalA)/Math.log(pvalB)
2415
2458
  end
2416
2459
 
2460
# Computes descriptive statistics over the sizes (number of terms) of the stored profiles.
# The variance is the population variance (sum of squared deviations divided by the
# number of profiles).
# ===== Returns
# a Hash (with default value 0) holding :average, :variance, :standardDeviation, :max,
# :min, :count, :countNonZero (only set when at least one profile is non-empty),
# :q1, :median and :q3
def profile_stats
  stats = Hash.new(0)
  data = @profiles.values.map { |ont_ids| ont_ids.size }
  mean = data.sum.fdiv(data.size)
  stats[:average] = mean
  # Deviations are taken against the local mean: reading a key that was never stored
  # in this hash silently returns the default 0 and would distort the variance.
  sum_devs = data.sum { |element| (element - mean)**2 }
  stats[:variance] = sum_devs.fdiv(data.size)
  stats[:standardDeviation] = stats[:variance]**0.5
  stats[:max] = data.max
  stats[:min] = data.min

  stats[:count] = data.size
  data.each do |value|
    stats[:countNonZero] += 1 if value != 0
  end

  # NOTE(review): Array#get_quantiles is not defined in this file; presumably it is
  # provided by the expcalc dependency - confirm.
  stats[:q1] = data.get_quantiles(0.25)
  stats[:median] = data.get_quantiles(0.5)
  stats[:q3] = data.get_quantiles(0.75)
  return stats
end
2481
+
2417
2482
  #============================================================================
2418
2483
  #============================================================================
2419
2484
 
@@ -92,7 +92,7 @@ end
92
92
  # +charsToRemove+:: char (or chars set) to be removed from texts to be compared
93
93
  # +unique+:: boolean flag which indicates if repeated elements must be removed
94
94
  # Returns the similarity percentage for all elements into array
95
- def similitude_network(items_array, splitChar = ";", charsToRemove = "", unique = false)
95
+ def similitude_network(items_array, splitChar: ";", charsToRemove: "", unique: false)
96
96
  # Special cases
97
97
  return nil if items_array.nil?
98
98
  return nil if !items_array.is_a? Array
@@ -1,3 +1,3 @@
1
1
  module Semtools
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.8"
3
3
  end
data/lib/semtools.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require "semtools/version"
2
2
  require "semtools/sim_handler"
3
- require "semtools/math_methods"
4
3
  require "semtools/ontology"
5
4
 
6
5
  module Semtools
data/semtools.gemspec CHANGED
@@ -31,6 +31,8 @@ Gem::Specification.new do |spec|
31
31
  spec.require_paths = ["lib"]
32
32
 
33
33
  spec.add_dependency "text"
34
+ spec.add_dependency "down"
35
+ spec.add_dependency "expcalc"
34
36
 
35
37
  spec.add_development_dependency "rake"
36
38
  spec.add_development_dependency "rspec"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoanezonjic
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-05-27 00:00:00.000000000 Z
12
+ date: 2022-03-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: text
@@ -25,6 +25,34 @@ dependencies:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
27
  version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: down
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: expcalc
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
28
56
  - !ruby/object:Gem::Dependency
29
57
  name: rake
30
58
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +103,7 @@ email:
75
103
  executables:
76
104
  - console
77
105
  - onto2json.rb
106
+ - semtools.rb
78
107
  - setup
79
108
  - strsimnet.rb
80
109
  extensions: []
@@ -90,8 +119,10 @@ files:
90
119
  - Rakefile
91
120
  - bin/console
92
121
  - bin/onto2json.rb
122
+ - bin/semtools.rb
93
123
  - bin/setup
94
124
  - bin/strsimnet.rb
125
+ - external_data/ontologies.txt
95
126
  - lib/data/hp.obo
96
127
  - lib/data/phenotype_annotation.tab
97
128
  - lib/semtools.rb
@@ -119,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
150
  - !ruby/object:Gem::Version
120
151
  version: '0'
121
152
  requirements: []
122
- rubygems_version: 3.2.3
153
+ rubygems_version: 3.2.15
123
154
  signing_key:
124
155
  specification_version: 4
125
156
  summary: Gem to handle semantic based calculations in text and defined ontologies