RubyGems - semtools - Versions diffs - 0.1.6 → 0.1.8 - Mend

semtools 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/Gemfile +4 -1
data/bin/semtools.rb +446 -0
data/bin/strsimnet.rb +1 -2
data/external_data/ontologies.txt +4 -0
data/lib/semtools/math_methods.rb +136 -136
data/lib/semtools/ontology.rb +72 -7
data/lib/semtools/sim_handler.rb +1 -1
data/lib/semtools/version.rb +1 -1
data/lib/semtools.rb +0 -1
data/semtools.gemspec +2 -0
metadata +34 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
-  data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
+  metadata.gz: a3f63cc6548a9938e31121d2018d1c1c477987007c5d253b5fa814a285bdb576
+  data.tar.gz: e1911d3157c3046590ca13bc86215d2260b4a8b2b1b25affa5c2673881036795
 SHA512:
-  metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
-  data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
+  metadata.gz: 30c95df80957a4a35b6fea05b9552352f529d8e45c10f6b128924a3ce2ee5d90e92a1e9d5fe0016d25538147e12d3a9199c81222642c94cdd0eb3c89eea168ef
+  data.tar.gz: ddc9e600fd984e68d060b7be05adf27b3f20bb67e638d42acc4b9b156eedabfce20d6f588a03d1fbc2948fedbd80d498f1767c0e3f8ea03720fa0ca327b95f3c

data/Gemfile CHANGED Viewed

@@ -5,5 +5,8 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
 # Specify your gem's dependencies in semtools.gemspec
 gemspec
-gem "rake", "~> 12.0"
+gem "rake", "~> 13.0"
 gem "minitest", "~> 5.0"
+expcalc_dev_path = File.expand_path('~/dev_gems/expcalc')
+gem "expcalc", github: "seoanezonjic/expcalc", branch: "master" if Dir.exist?(expcalc_dev_path)

data/bin/semtools.rb ADDED Viewed

@@ -0,0 +1,446 @@
+#! /usr/bin/env ruby
+ROOT_PATH = File.dirname(__FILE__)
+$LOAD_PATH.unshift(File.expand_path(File.join(ROOT_PATH, '..', 'lib')))
+EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
+require 'optparse'
+require 'down'
+require 'semtools'
+######################################################################################
+## METHODS
+######################################################################################
+def load_tabular_file(file)
+  records = []
+  File.open(file).each do |line|
+    line.chomp!
+    fields = line.split("\t")
+    records << fields
+  end
+  return records
+end
+# Registers every profile on the ontology through ontology.add_profile.
+#
+# file:: collection that yields [id, terms] pairs when iterated. Despite the
+#        name it is not a file handle: this script passes the parsed rows or
+#        a Hash of profiles.
+# ontology:: object responding to add_profile(id, terms).
+def store_profiles(file, ontology)
+  file.each do |id, terms|
+    ontology.add_profile(id, terms)
+  end
+end
+def load_value(hash_to_load, key, value, unique = true)
+   	query = hash_to_load[key]
+    if query.nil?
+       value = [value] if value.class != Array
+       hash_to_load[key] = value
+    else
+        if value.class == Array
+            query.concat(value)
+        else
+            query << value
+        end
+        query.uniq! unless unique == nil
+    end
+end
+def translate(ontology, type, options, profiles = nil)
+  not_translated = {}
+  if type == 'names'
+    ontology.profiles.each do |id, terms|
+      translation, untranslated = ontology.translate_ids(terms)
+      ontology.profiles[id] = translation
+      not_translated[id] = untranslated unless untranslated.empty?
+    end
+  elsif type == 'codes'
+    profiles.each do |id,terms|
+      translation, untranslated = ontology.translate_names(terms)
+      profiles[id] = translation
+      profiles[id] = profiles[id].join("#{options[:separator]}")
+      not_translated[id] = untranslated unless untranslated.empty?
+    end
+  end
+  if !not_translated.empty?
+    File.open(options[:untranslated_path], 'w') do |file|
+      not_translated.each do |id, terms|
+          file.puts([id, terms.join(";")].join("\t"))
+      end
+    end
+  end
+end
+# Cleans one profile with ontology.clean_profile_hard and optionally keeps
+# only the terms that have options[:term_filter] among their ancestors.
+#
+# profile:: terms of the profile.
+# ontology:: object responding to clean_profile_hard and get_ancestors.
+# options:: Hash read for :term_filter (nil disables the filter).
+#
+# Returns the cleaned profile.
+def clean_profile(profile, ontology, options)
+	cleaned_profile = ontology.clean_profile_hard(profile)
+	unless options[:term_filter].nil?
+		cleaned_profile.select! {|term| ontology.get_ancestors(term).include?(options[:term_filter])}
+	end
+	return cleaned_profile
+end
+# Cleans every profile in place with clean_profile and deletes the profiles
+# that end up empty.
+#
+# profiles:: Hash of id => terms; modified in place.
+# ontology, options:: passed through to clean_profile.
+#
+# Returns the Array of ids that were deleted.
+def clean_profiles(profiles, ontology, options)
+	removed_profiles = []
+	profiles.each do |id, terms|
+		cleaned_profile = clean_profile(terms, ontology, options)
+		profiles[id] = cleaned_profile
+		removed_profiles << id if cleaned_profile.empty?
+	end
+	# Deletion happens after the loop so no key is removed while iterating.
+	removed_profiles.each{|rp| profiles.delete(rp)}
+	return removed_profiles
+end
+def expand_profiles(profiles, ontology, unwanted_terms = [])
+	profiles.each do |disease_id, terms|
+		terms.each do |term|
+	    	profiles[disease_id] << ontology.get_ancestors(term).difference(unwanted_terms)
+	  end
+	end
+end
+# Writes the similarity of every ordered pair of profiles (each profile is
+# also compared with itself) as "query<TAB>search<TAB>similarity" lines.
+#
+# input:: path whose basename, without extension, names the output file
+#         "<basename>_semantic_similarity_list" (created in the current
+#         working directory).
+# onto_obj:: object responding to profiles and compare.
+# similarity_type:: forwarded to onto_obj.compare as sim_type.
+def write_similarity_profile_list(input, onto_obj, similarity_type)
+  similarity_file = File.basename(input, ".*")+'_semantic_similarity_list'
+  File.open(similarity_file, 'w') do |file|
+    onto_obj.profiles.each do |profile_query_key, profile_query_value|
+      onto_obj.profiles.each do |profile_search_key, profile_search_value|
+        file.puts([profile_query_key, profile_search_key, onto_obj.compare(profile_query_value, profile_search_value, sim_type: similarity_type)].join("\t"))
+      end
+    end
+  end
+end
+# Downloads the obo file registered under +key+ in the ontology index, or
+# lists the obo files already stored next to the index.
+#
+# source:: path of the tab-separated index file (name<TAB>url).
+# key:: ontology name to download; 'list' prints the *.obo files found in the
+#       directory of +source+ instead of downloading.
+# output:: destination path. When nil the file is named "<key>.obo" and saved
+#          in the directory of +source+ if it is writable, otherwise in the
+#          current working directory.
+#
+# A key that is not in the index downloads nothing and prints no message.
+def download(source, key, output)
+  source_list = load_tabular_file(source).to_h
+  external_data = File.dirname(source)
+  if key == 'list'
+    Dir.glob(File.join(external_data,'*.obo')){|f| puts f}
+  else
+    url = source_list[key]
+    if !output.nil?
+      output_path = output
+    else
+      file_name = key + '.obo'
+      if File.writable?(external_data)
+        output_path = File.join(external_data, file_name)
+      else
+        output_path = file_name
+      end
+    end
+    if !url.nil?
+      Down::NetHttp.download(url, destination: output_path, max_redirects: 5)
+      File.chmod(0644, output_path) # Correct file permissions set by down gem
+    end
+  end
+end
+def get_ontology_file(path, source)
+  if !File.exists?(path)
+    ont_index = load_tabular_file(source).to_h
+    if !ont_index[path].nil?
+      path = File.join(File.dirname(source), path + '.obo')
+    else
+      abort("Input ontology file not exists")
+    end
+  end
+  return path
+end
+# Turns a statistics Hash into an ordered list of [label, value] rows ready to
+# be printed.
+#
+# stats:: Hash read for :count, :countNonZero, :max, :min, :average,
+#         :variance, :standardDeviation, :q1, :median and :q3.
+#
+# Returns an Array of two-element Arrays. 'Non Zero Density' is computed here
+# as countNonZero / count.
+def get_stats(stats)
+  report_stats = []
+  report_stats << ['Elements', stats[:count]]
+  report_stats << ['Elements Non Zero', stats[:countNonZero]]
+  report_stats << ['Non Zero Density', stats[:countNonZero].fdiv(stats[:count])]
+  report_stats << ['Max', stats[:max]]
+  report_stats << ['Min', stats[:min]]
+  report_stats << ['Average', stats[:average]]
+  report_stats << ['Variance', stats[:variance]]
+  report_stats << ['Standard Deviation', stats[:standardDeviation]]
+  report_stats << ['Q1', stats[:q1]]
+  report_stats << ['Median', stats[:median]]
+  report_stats << ['Q3', stats[:q3]]
+  return report_stats
+end
+####################################################################################
+## OPTPARSE
+####################################################################################
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
+  options[:download] = nil
+  opts.on("-d", "--download STRING", "Download obo file from official resource. MONDO, GO and HPO are possible values.") do |item|
+    options[:download] = item
+  end
+  options[:input_file] = nil
+  opts.on("-i", "--input_file PATH", "Filepath of profile data") do |item|
+    options[:input_file] = item
+  end
+  options[:output_file] = nil
+  opts.on("-o", "--output_file PATH", "Output filepath") do |item|
+    options[:output_file] = item
+  end
+  options[:IC] = false
+  opts.on("-I", "--IC", "Get IC") do
+    options[:IC] = true
+  end
+  options[:ontology_file] = nil
+  opts.on("-O PATH", "--ontology_file PATH", "Path to ontology file") do |item|
+  	options[:ontology_file] = item
+  end
+  options[:term_filter] = nil
+  opts.on("-T STRING", "--term_filter STRING", "If specified, only terms that are descendants of the specified term will be kept on a profile when cleaned") do |item|
+  	options[:term_filter] = item.to_sym
+  end
+  options[:translate] = nil
+  opts.on("-t STRING", "--translate STRING", "Translate to 'names' or to 'codes'") do |item|
+  	options[:translate] = item
+  end
+  opts.on("-s method", "--similarity method", "Calculate similarity between profile IDs computed by 'resnik', 'lin' or 'jiang_conrath' methods. ") do |sim_method|
+  	options[:similarity] = sim_method.to_sym
+  end
+  options[:clean_profiles] = false
+	opts.on("-c", "--clean_profiles", "Removes ancestors, descendants and obsolete terms from profiles") do
+  	options[:clean_profiles] = true
+  end
+  options[:removed_path] = 'rejected_profs'
+  opts.on("-r PATH", "--removed_path PATH", "Desired path to write removed profiles file") do |item|
+  	options[:removed_path] = item
+  end
+  options[:untranslated_path] = nil
+  opts.on("-u PATH", "--untranslated_path PATH", "Desired path to write untranslated terms file") do |item|
+    options[:untranslated_path] = item
+  end
+  options[:keyword] = nil
+  opts.on("-k STRING", "--keyword STRING", "regex used to get xref terms in the ontology file") do |item|
+    options[:keyword] = item
+  end
+  options[:xref_sense] = :byValue
+  opts.on("--xref_sense ", "Ontology-xref or xref-ontology. By default xref-ontology if set, ontology-xref") do
+    options[:xref_sense] = :byTerm
+  end
+  options[:expand_profiles] = false
+  opts.on("-e", "--expand_profiles", "Expand profiles adding ancestors") do
+    options[:expand_profiles] = true
+  end
+  options[:unwanted_terms] = []
+  opts.on("-U", "--unwanted_terms STRING", "Comma separated terms not wanted to be included in profile expansion") do |item|
+    options[:unwanted_terms] = item
+  end
+  options[:separator] = ";"
+  opts.on("-S STRING", "--separator STRING", "Separator used for the terms profile") do |sep|
+    options[:separator] = sep
+  end
+  options[:childs] = [[], '']
+  opts.on("-C STRING", "--childs STRING", "Term code list (comma separated) to generate child list") do |item|
+    if item.include?('/')
+      modifiers, terms = item.split('/')
+    else
+      modifiers = ''
+      terms = item
+    end
+    terms = terms.split(',').map{|t| t.to_sym}
+    options[:childs] = [terms, modifiers]
+  end
+  options[:statistics] = false
+  opts.on("-n", "--statistics", "To obtain main statistical descriptors of the profiles file") do
+    options[:statistics] = true
+  end
+  options[:list_translate] = nil
+  opts.on("-l STRING", "--list_translate STRING", "Translate to 'names' or to 'codes' input list") do |sep|
+    options[:list_translate] = sep
+  end
+  options[:subject_column] = 0
+  opts.on("-f NUM", "--subject_column NUM", "The number of the column for the subject id") do |ncol|
+    options[:subject_column] = ncol.to_i
+  end
+  options[:annotations_column] = 1
+  opts.on("-a NUM", "--annotations_column NUM", "The number of the column for the annotation ids") do |ncol|
+    options[:annotations_column] = ncol.to_i
+  end
+  options[:list_term_attributes] = false
+  opts.on("--list_term_attributes", "The number of the column for the annotation ids") do
+    options[:list_term_attributes] = true
+  end
+end.parse!
+####################################################################################
+## MAIN
+####################################################################################
+# Main script. The steps below run in a fixed order and several of them share
+# state through +data+ and ontology.profiles.
+ont_index_file = File.join(EXTERNAL_DATA, 'ontologies.txt')
+# Download mode: fetch (or list) ontology files and stop.
+if !options[:download].nil?
+  download(ont_index_file, options[:download], options[:output_file])
+  Process.exit
+end
+# The ontology argument may be a file path or a name listed in the index.
+if !options[:ontology_file].nil?
+  options[:ontology_file] = get_ontology_file(options[:ontology_file], ont_index_file)
+end
+ontology = Ontology.new(file: options[:ontology_file], load_file: true)
+# Load the input table. Rows become [subject, [term symbols]] unless a plain
+# list translation was requested without a keyword.
+if !options[:input_file].nil?
+  data = load_tabular_file(options[:input_file])
+  if options[:list_translate].nil? || !options[:keyword].nil?
+    data.map!{|row|
+      [row[options[:subject_column]],
+       row[options[:annotations_column]].split(options[:separator]).map!{|term| term.to_sym}]
+    }
+    store_profiles(data, ontology) if options[:translate] != 'codes' && options[:keyword].nil?
+  end
+end
+# List translation mode: print one translated value per input row and stop.
+# NOTE(review): +data+ is only assigned when an input file was given - confirm
+# that --list_translate is never used without --input_file.
+if !options[:list_translate].nil?
+  data.each do |term|
+    if options[:list_translate] == 'names'
+      translation, untranslated = ontology.translate_ids(term)
+    elsif options[:list_translate] == 'codes'
+      translation, untranslated = ontology.translate_names(term)
+    end
+    puts "#{term.first}\t#{translation.empty? ? '-' : translation.first}"
+  end
+  Process.exit
+end
+# Translate profile names to codes before storing them on the ontology.
+# NOTE(review): when this branch runs, the rows appear to have been converted
+# above to [subject, Array of symbols]; core Ruby Array has no #split, so the
+# terms.split call looks suspicious - confirm.
+if options[:translate] == 'codes'
+  profiles = {}
+  data.each do |id, terms|
+    load_value(profiles, id, terms)
+    profiles[id] = terms.split(options[:separator])
+  end
+  translate(ontology, 'codes', options, profiles)
+  store_profiles(profiles, ontology)
+end
+# Clean the stored profiles and write the ids of the profiles that were removed.
+if options[:clean_profiles]
+	removed_profiles = clean_profiles(ontology.profiles, ontology, options)
+	if !removed_profiles.nil? && !removed_profiles.empty?
+      File.open(options[:removed_path], 'w') do |f|
+          removed_profiles.each do |profile|
+              f.puts profile
+          end
+      end
+	end
+end
+if options[:expand_profiles]
+  expanded_profiles = expand_profiles(ontology.profiles, ontology, options[:unwanted_terms])
+end
+# The "name = value" arguments below are plain local assignments evaluated in
+# place; the values are passed positionally.
+if !options[:similarity].nil?
+  write_similarity_profile_list(input = options[:input_file], onto_obj=ontology, similarity_type = options[:similarity])
+end
+# Information content per profile, written to "<input basename>_IC_onto_freq".
+if options[:IC]
+  ontology.add_observed_terms_from_profiles
+  by_ontology, by_freq = ontology.get_profiles_resnik_dual_ICs
+  ic_file = File.basename(options[:input_file], ".*")+'_IC_onto_freq'
+  File.open(ic_file , 'w') do |file|
+    ontology.profiles.keys.each do |id|
+        file.puts([id, by_ontology[id], by_freq[id]].join("\t"))
+    end
+  end
+end
+if options[:translate] == 'names'
+  translate(ontology, 'names', options)
+end
+# Child listing. Modifier 'r' prints parent/child relation pairs instead of a
+# flat list and modifier 'n' prints names instead of codes.
+if !options[:childs].first.empty?
+  terms, modifiers = options[:childs]
+  all_childs = []
+  terms.each do |term|
+   childs = ontology.get_descendants(term)
+   all_childs = all_childs | childs
+  end
+  if modifiers.include?('r')
+    relations = []
+    all_childs = all_childs | terms # Add parents that generated child list
+    all_childs.each do |term|
+      descendants = ontology.get_direct_descendants(term)
+      if !descendants.nil?
+        descendants.each do |desc|
+          relations << [term, desc]
+        end
+      end
+    end
+    relations.each do |rel|
+      rel, _ = ontology.translate_ids(rel) if modifiers.include?('n')
+      puts rel.join("\t")
+    end
+  else
+    all_childs.each do |c|
+      if modifiers.include?('n')
+        puts ontology.translate_id(c)
+      else
+        puts c
+      end
+    end
+  end
+end
+# Write the profiles currently stored on the ontology as "id<TAB>term|term".
+if !options[:output_file].nil?
+  File.open(options[:output_file], 'w') do |file|
+    ontology.profiles.each do |id, terms|
+      file.puts([id, terms.join("|")].join("\t"))
+    end
+  end
+end
+if options[:statistics]
+  get_stats(ontology.profile_stats).each do |stat|
+    puts stat.join("\t")
+  end
+end
+if options[:list_term_attributes]
+  term_attributes = ontology.list_term_attributes
+  term_attributes.each do |t_attr|
+    t_attr[0] = t_attr[0].to_s
+    puts t_attr.join("\t")
+  end
+end
+# Cross-reference translation: map every profile term through the xref
+# dictionary selected by the keyword regex and write "id<TAB>xref" lines.
+# NOTE(review): this opens options[:output_file] again in 'w' mode, replacing
+# what the profile writer above stored there, and the path may still be nil
+# if --output_file was not given - confirm this is intended.
+if !options[:keyword].nil?
+  xref_translated = []
+  ontology.calc_dictionary(:xref, select_regex: /(#{options[:keyword]})/, store_tag: :tag, multiterm: true, substitute_alternatives: false)
+  dict = ontology.dicts[:tag][options[:xref_sense]]
+  data.each do |id, prof|
+    xrefs = []
+    prof.each do |t|
+      query = dict[t.to_s]
+      xrefs.concat(query) if !query.nil?
+    end
+    xref_translated << [id, xrefs] if !xrefs.empty?
+  end
+  File.open(options[:output_file], 'w') do |f|
+    xref_translated.each do |id, prof|
+      prof.each do |t|
+        f.puts [id, t].join("\t")
+      end
+    end
+  end
+end

data/bin/strsimnet.rb CHANGED Viewed

@@ -111,12 +111,11 @@ texts2compare = load_table_file(input_file = options[:input_file],
                                  targetCol = options[:cindex],
                                  filterCol = options[:findex],
                                  filterValue = options[:filter_value])
 # Verbose point
 puts "Calculating similitude for (" + texts2compare.length.to_s + ") elements"
 # Obtain all Vs all
-similitudes_AllVsAll = similitude_network(texts2compare,options[:rm_char])
+similitudes_AllVsAll = similitude_network(texts2compare, charsToRemove: options[:rm_char])
 # Verbose point
 puts "Writing output file ..."

data/external_data/ontologies.txt ADDED Viewed

@@ -0,0 +1,4 @@
+GO	http://purl.obolibrary.org/obo/go/go-basic.obo
+HPO	http://purl.obolibrary.org/obo/hp.obo
+MONDO	http://purl.obolibrary.org/obo/mondo.obo
+EFO	http://www.ebi.ac.uk/efo/efo.obo

data/lib/semtools/math_methods.rb CHANGED Viewed

@@ -1,148 +1,148 @@
-# TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
-#to cmpute fisher exact test
-#Fisher => http://www.biostathandbook.com/fishers.html
-def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
-	#puts '-', listA.inspect, listB.inspect, '-'
-	listA_listB = listA & listB
-	listA_nolistB = listA - listB
-	nolistA_listB = listB - listA
-	if weigths.nil?
-		listA_listB_count = listA_listB.length
-		listA_nolistB_count = listA_nolistB.length
-		nolistA_listB_count = nolistA_listB.length
-		nolistA_nolistB_count = all_elements_count - (listA | listB).length
-	else
-		# Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
-		# https://academic.oup.com/bioinformatics/article/22/13/1600/193669
-		listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-		listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-		nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+# # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
+# #to cmpute fisher exact test
+# #Fisher => http://www.biostathandbook.com/fishers.html
+# def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
+# 	#puts '-', listA.inspect, listB.inspect, '-'
+# 	listA_listB = listA & listB
+# 	listA_nolistB = listA - listB
+# 	nolistA_listB = listB - listA
+# 	if weigths.nil?
+# 		listA_listB_count = listA_listB.length
+# 		listA_nolistB_count = listA_nolistB.length
+# 		nolistA_listB_count = nolistA_listB.length
+# 		nolistA_nolistB_count = all_elements_count - (listA | listB).length
+# 	else
+# 		# Fisher exact test weigthed as proposed in Improved scoring of functional groups from gene expression data by decorrelating GO graph structure
+# 		# https://academic.oup.com/bioinformatics/article/22/13/1600/193669
+# 		listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+# 		listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+# 		nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-		if partial_weigths
-			nolistA_nolistB_count = all_elements_count - (listA | listB).length
-			all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
-		else
-			nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-			all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
-		end
-	end
-	#puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
-	if tail == 'two_sided'
-		accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
-	elsif tail == 'less'
-		accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
-	end
-	return accumulated_prob
-end
+# 		if partial_weigths
+# 			nolistA_nolistB_count = all_elements_count - (listA | listB).length
+# 			all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
+# 		else
+# 			nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+# 			all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
+# 		end
+# 	end
+# 	#puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
+# 	if tail == 'two_sided'
+# 		accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
+# 	elsif tail == 'less'
+# 		accumulated_prob = get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
+# 	end
+# 	return accumulated_prob
+# end
-def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
-	#https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
-	accumulated_prob = 0
-	ref_prob = compute_hyper_prob(
-		listA_listB_count,
-		listA_nolistB_count,
-		nolistA_listB_count,
-		nolistA_nolistB_count,
-		all_elements_count
-	)
-	accumulated_prob += ref_prob
-	[listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
-		n += 1
-		prob = compute_hyper_prob(
-			listA_listB_count - n,
-			listA_nolistB_count + n,
-			nolistA_listB_count + n,
-			nolistA_nolistB_count - n,
-			all_elements_count
-		)
-		prob <= ref_prob ? accumulated_prob += prob : break
-	end
+# def get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
+# 	#https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf
+# 	accumulated_prob = 0
+# 	ref_prob = compute_hyper_prob(
+# 		listA_listB_count,
+# 		listA_nolistB_count,
+# 		nolistA_listB_count,
+# 		nolistA_nolistB_count,
+# 		all_elements_count
+# 	)
+# 	accumulated_prob += ref_prob
+# 	[listA_listB_count, nolistA_nolistB_count].min.times do |n| #less
+# 		n += 1
+# 		prob = compute_hyper_prob(
+# 			listA_listB_count - n,
+# 			listA_nolistB_count + n,
+# 			nolistA_listB_count + n,
+# 			nolistA_nolistB_count - n,
+# 			all_elements_count
+# 		)
+# 		prob <= ref_prob ? accumulated_prob += prob : break
+# 	end
-	[listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
-		n += 1
-		prob = compute_hyper_prob(
-			listA_listB_count + n,
-			listA_nolistB_count - n,
-			nolistA_listB_count - n,
-			nolistA_nolistB_count + n,
-			all_elements_count
-		)
-		accumulated_prob += prob if prob <= ref_prob
-	end
+# 	[listA_nolistB_count, nolistA_listB_count].min.times do |n| #greater
+# 		n += 1
+# 		prob = compute_hyper_prob(
+# 			listA_listB_count + n,
+# 			listA_nolistB_count - n,
+# 			nolistA_listB_count - n,
+# 			nolistA_nolistB_count + n,
+# 			all_elements_count
+# 		)
+# 		accumulated_prob += prob if prob <= ref_prob
+# 	end
-	return accumulated_prob
-end
+# 	return accumulated_prob
+# end
-def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
-	accumulated_prob = 0
-	[listA_listB_count, nolistA_nolistB_count].min.times do |n|
-		accumulated_prob += compute_hyper_prob(
-			listA_listB_count - n,
-			listA_nolistB_count + n,
-			nolistA_listB_count + n,
-			nolistA_nolistB_count - n,
-			all_elements_count
-		)
-	end
-	return accumulated_prob
-end
+# def get_less_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
+# 	accumulated_prob = 0
+# 	[listA_listB_count, nolistA_nolistB_count].min.times do |n|
+# 		accumulated_prob += compute_hyper_prob(
+# 			listA_listB_count - n,
+# 			listA_nolistB_count + n,
+# 			nolistA_listB_count + n,
+# 			nolistA_nolistB_count - n,
+# 			all_elements_count
+# 		)
+# 	end
+# 	return accumulated_prob
+# end
-def compute_hyper_prob(a, b, c, d, n)
-	# https://en.wikipedia.org/wiki/Fisher%27s_exact_test
-	binomA = binom(a + b, a)
-	binomC = binom(c + d, c)
-	divisor = binom(n, a + c)
-	return (binomA * binomC).fdiv(divisor)
-end
+# def compute_hyper_prob(a, b, c, d, n)
+# 	# https://en.wikipedia.org/wiki/Fisher%27s_exact_test
+# 	binomA = binom(a + b, a)
+# 	binomC = binom(c + d, c)
+# 	divisor = binom(n, a + c)
+# 	return (binomA * binomC).fdiv(divisor)
+# end
-def binom(n,k)
-	if k > 0 && k < n
-		res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
-	else
-		res = 1
-	end
-end
+# def binom(n,k)
+# 	if k > 0 && k < n
+# 		res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
+# 	else
+# 		res = 1
+# 	end
+# end
-#to cmpute adjusted pvalues
-#https://rosettacode.org/wiki/P-value_correction#Ruby
-def get_benjaminiHochberg_pvalues(arr_pvalues)
-	n = arr_pvalues.length
-	arr_o = order(arr_pvalues, true)
-	arr_cummin_input = []
-	(0..(n - 1)).each do |i|
-		arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
-	end
-	arr_ro = order(arr_o)
-	arr_cummin = cummin(arr_cummin_input)
-	arr_pmin = pmin(arr_cummin)
-	return arr_pmin.values_at(*arr_ro)
-end
+# #to cmpute adjusted pvalues
+# #https://rosettacode.org/wiki/P-value_correction#Ruby
+# def get_benjaminiHochberg_pvalues(arr_pvalues)
+# 	n = arr_pvalues.length
+# 	arr_o = order(arr_pvalues, true)
+# 	arr_cummin_input = []
+# 	(0..(n - 1)).each do |i|
+# 		arr_cummin_input[i] = (n / (n - i).to_f) * arr_pvalues[arr_o[i]]
+# 	end
+# 	arr_ro = order(arr_o)
+# 	arr_cummin = cummin(arr_cummin_input)
+# 	arr_pmin = pmin(arr_cummin)
+# 	return arr_pmin.values_at(*arr_ro)
+# end
-def order(array, decreasing = false)
-	if decreasing == false
-		array.sort.map { |n| array.index(n) }
-	else
-		array.sort.map { |n| array.index(n) }.reverse
-	end
-end
+# def order(array, decreasing = false)
+# 	if decreasing == false
+# 		array.sort.map { |n| array.index(n) }
+# 	else
+# 		array.sort.map { |n| array.index(n) }.reverse
+# 	end
+# end
-def cummin(array)
-	cumulative_min = array.first
-	arr_cummin = []
-	array.each do |p|
-		cumulative_min = [p, cumulative_min].min
-		arr_cummin << cumulative_min
-	end
-	return arr_cummin
-end
+# def cummin(array)
+# 	cumulative_min = array.first
+# 	arr_cummin = []
+# 	array.each do |p|
+# 		cumulative_min = [p, cumulative_min].min
+# 		arr_cummin << cumulative_min
+# 	end
+# 	return arr_cummin
+# end
-def pmin(array)
-	x = 1
-	pmin_array = []
-	array.each_index do |i|
-		pmin_array[i] = [array[i], x].min
-		abort if pmin_array[i] > 1
-	end
-	return pmin_array
-end
+# def pmin(array)
+# 	x = 1
+# 	pmin_array = []
+# 	array.each_index do |i|
+# 		pmin_array[i] = [array[i], x].min
+# 		abort if pmin_array[i] > 1
+# 	end
+# 	return pmin_array
+# end

data/lib/semtools/ontology.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'expcalc'
 require 'json'
 require 'colorize'
@@ -45,7 +46,7 @@ class Ontology
     @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
     @@multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider, :subsetdef, :synonymtypedef, :property_value, :remark]
     @@symbolizable_ids.concat(@@tags_with_trailing_modifiers)
     #############################################
     # CONSTRUCTOR
     #############################################
@@ -202,6 +203,7 @@ class Ontology
         # Only TERMS multivalue tags (future add Typedefs and Instance)
         # multivalue_tags = [:alt_id, :is_a, :subset, :synonym, :xref, :intersection_of, :union_of, :disjoint_from, :relationship, :replaced_by, :consider]
         attributes.each do |tag, value|
+            value.gsub!(/{source=[\\\":A-Za-z0-9\/\.\-, =]+} /, '') if tag == 'is_a' # To delete "source" attributes in is_a tag of MONDO ontology
             # Check
             raise EncodingError, 'Info element incorrect format' if (tag.nil?) || (value.nil?)
             # Prepare
@@ -553,14 +555,14 @@ class Ontology
         self.get_index_obsoletes
         self.get_index_alternatives
         self.get_index_child_parent_relations
-            @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
+            @alternatives_index.each{|k,v| @alternatives_index[k] = self.extract_id(v)}
             ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
             @alternatives_index.compact!
-            @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
+            @obsoletes_index.each{|k,v| @obsoletes_index[k] = self.extract_id(v)}
             @obsoletes_index.compact!
-            @ancestors_index.map{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
+            @ancestors_index.each{|k,v| @ancestors_index[k] = v.map{|t| self.extract_id(t)}.compact}
             @ancestors_index.compact!
-            @descendants_index.map{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
+            @descendants_index.each{|k,v| @descendants_index[k] = v.map{|t| self.extract_id(t)}.compact}
             @descendants_index.compact!
         self.get_index_frequencies
         self.calc_dictionary(:name)
@@ -721,7 +723,7 @@ class Ontology
     # an array with all ancestors/descendants of given term or nil if parents are not available yet
     def get_familiar(term, return_ancestors = true, filter_alternatives = false)
         # Find into parentals
-        familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
+        familiars = return_ancestors ? @ancestors_index[term] : @descendants_index[term]
         if !familiars.nil?
             familiars = familiars.clone
             if filter_alternatives
@@ -1580,9 +1582,12 @@ class Ontology
         return terms_without_ancestors_and_alternatices
     end
-    def clean_profile_hard(profile)
+    def clean_profile_hard(profile, options = {})
         profile, _ = check_ids(profile)
         profile = profile.select{|t| !is_obsolete?(t)}
+        if !options[:term_filter].nil?
+            profile.select! {|term| get_ancestors(term).include?(options[:term_filter])}
+        end
         profile = clean_profile(profile.uniq)
         return profile
     end
@@ -1642,6 +1647,27 @@ class Ontology
     end
+    # Pairs every profile size (get_profiles_sizes) with the first element of
+    # the matching parentals_per_profile entry and sorts the pairs by
+    # decreasing profile size.
+    # NOTE(review): that first element is presumably the number of parental
+    # terms found in the profile - confirm against parentals_per_profile.
+    # ===== Returns
+    # two parallel arrays: profile sizes and parental values, in the same order
+    def get_profile_redundancy()
+      profile_sizes = self.get_profiles_sizes
+      parental_terms_per_profile = self.parentals_per_profile# clean_profiles
+      parental_terms_per_profile = parental_terms_per_profile.map{|item| item[0]}
+      profile_sizes, parental_terms_per_profile = profile_sizes.zip(parental_terms_per_profile).sort_by{|i| i.first}.reverse.transpose
+      return profile_sizes, parental_terms_per_profile
+    end
+    # Builds, for every stored profile, the table returned by
+    # get_childs_table(terms, true) and measures how many terms have an entry
+    # whose last element is not empty.
+    # ===== Returns
+    # a hash of profile id => childs table, and the fraction of all profile
+    # terms that have a non-empty last element in their table record
+    def compute_term_list_and_childs()
+      suggested_childs = {}
+      total_terms = 0
+      terms_with_more_specific_childs = 0
+      @profiles.each do |id, terms|
+        total_terms += terms.length
+        more_specific_childs = self.get_childs_table(terms, true)
+        terms_with_more_specific_childs += more_specific_childs.select{|hpo_record| !hpo_record.last.empty?}.length #Exclude phenotypes with no childs
+        suggested_childs[id] = more_specific_childs
+      end
+      return suggested_childs, terms_with_more_specific_childs.fdiv(total_terms)
+    end
     #  Calculates mean IC of a given profile
     # ===== Parameters
     # +prof+:: profile to be checked
@@ -2215,7 +2241,24 @@ class Ontology
         return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
     end
+    # Iterates over the entries of @stanzas[:terms], skipping every ID found in @alternatives_index.
+    # ===== Parameters
+    # +att+:: when truthy the block receives (id, tags); otherwise it receives only the id
+    # ===== Returns
+    # the value of the underlying @stanzas[:terms].each call
+    def each(att = false)
+        @stanzas[:terms].each do |term_id, term_tags|
+            unless @alternatives_index.include?(term_id)
+                att ? yield(term_id, term_tags) : yield(term_id)
+            end
+        end
+    end
+    # Collects one row per term yielded by each.
+    # ===== Returns
+    # array of triples [id, translate_id(id), get_term_level(id)]
+    def list_term_attributes
+        attributes = []
+        each { |term_code| attributes.push([term_code, translate_id(term_code), get_term_level(term_code)]) }
+        return attributes
+    end
 #============================================================================
 #============================================================================
@@ -2414,6 +2457,28 @@ class Ontology
         return Math.log(pvalA)/Math.log(pvalB)
     end
+    # Computes summary statistics over the profile sizes (number of items of each value in @profiles).
+    # ===== Returns
+    # Hash with default value 0 holding :average, :variance (sum of squared deviations divided
+    # by the number of profiles), :standardDeviation, :max, :min, :count, :countNonZero,
+    # :q1, :median and :q3
+    def profile_stats
+      stats = Hash.new(0)
+      data = @profiles.values.map{|ont_ids| ont_ids.size}
+      mean = data.sum().fdiv(data.size)
+      stats[:average] = mean
+      # Deviations are taken from the computed mean. Reading a misspelled key from this hash
+      # would silently return the default 0 and inflate the variance.
+      sum_devs = data.sum{|element| (element - mean) ** 2}
+      stats[:variance] = sum_devs.fdiv(data.size)
+      stats[:standardDeviation] = stats[:variance] ** 0.5
+      stats[:max] = data.max
+      stats[:min] = data.min
+      stats[:count] = data.size
+      # :countNonZero is only stored once a non-empty profile is seen; otherwise reads
+      # fall back to the hash default (0)
+      data.each do |value|
+        stats[:countNonZero] += 1 if value != 0
+      end
+      # NOTE(review): get_quantiles is not defined here; presumably an Array extension
+      # provided by the expcalc dependency - confirm
+      stats[:q1] = data.get_quantiles(0.25)
+      stats[:median] = data.get_quantiles(0.5)
+      stats[:q3] = data.get_quantiles(0.75)
+      return stats
+    end
 #============================================================================
 #============================================================================

data/lib/semtools/sim_handler.rb CHANGED Viewed

@@ -92,7 +92,7 @@ end
 # +charsToRemove+:: char (or chars set) to be removed from texts to be compared
 # +unique+:: boolean flag which indicates if repeated elements must be removed
 # Returns the similarity percentage for all elements into array
-def similitude_network(items_array, splitChar = ";", charsToRemove = "", unique = false)
+def similitude_network(items_array, splitChar: ";", charsToRemove: "", unique: false)
   # Special cases
   return nil if items_array.nil?
   return nil if !items_array.is_a? Array

data/lib/semtools/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 # Namespace module of the semtools gem; this file only defines the gem version constant.
 module Semtools
-  VERSION = "0.1.6"
+  VERSION = "0.1.8"
 end

data/lib/semtools.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 require "semtools/version"
 require "semtools/sim_handler"
-require "semtools/math_methods"
 require "semtools/ontology"
 module Semtools

data/semtools.gemspec CHANGED Viewed

@@ -31,6 +31,8 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
   spec.add_dependency "text"
+  spec.add_dependency "down"
+  spec.add_dependency "expcalc"
   spec.add_development_dependency "rake"
   spec.add_development_dependency "rspec"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semtools
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.8
 platform: ruby
 authors:
 - seoanezonjic
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-05-27 00:00:00.000000000 Z
+date: 2022-03-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text
@@ -25,6 +25,34 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: down
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: expcalc
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -75,6 +103,7 @@ email:
 executables:
 - console
 - onto2json.rb
+- semtools.rb
 - setup
 - strsimnet.rb
 extensions: []
@@ -90,8 +119,10 @@ files:
 - Rakefile
 - bin/console
 - bin/onto2json.rb
+- bin/semtools.rb
 - bin/setup
 - bin/strsimnet.rb
+- external_data/ontologies.txt
 - lib/data/hp.obo
 - lib/data/phenotype_annotation.tab
 - lib/semtools.rb
@@ -119,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.3
+rubygems_version: 3.2.15
 signing_key:
 specification_version: 4
 summary: Gem to handle semantic based calculations in text and defined ontologies