semtools 0.1.3 → 0.1.6
- checksums.yaml +4 -4
- data/lib/semtools/math_methods.rb +11 -3
- data/lib/semtools/ontology.rb +565 -123
- data/lib/semtools/version.rb +1 -1
- data/semtools.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
+  data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
+  data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
data/lib/semtools/math_methods.rb
CHANGED
@@ -1,7 +1,8 @@
 # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
 #to cmpute fisher exact test
 #Fisher => http://www.biostathandbook.com/fishers.html
-def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
+def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
+  #puts '-', listA.inspect, listB.inspect, '-'
   listA_listB = listA & listB
   listA_nolistB = listA - listB
   nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
     listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-
-
+
+    if partial_weigths
+      nolistA_nolistB_count = all_elements_count - (listA | listB).length
+      all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
+    else
+      nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+      all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
+    end
   end
+  #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
   if tail == 'two_sided'
     accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
   elsif tail == 'less'
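For orientation: the method reduces the two lists to a 2×2 contingency table, and with partial_weigths the universe is rescaled so the four cells sum to all_elements_count. A small worked sketch (item IDs and counts are made up; assumes this file's methods are loaded):

```ruby
# Hypothetical input to get_fisher_exact_test.
listA = [:i1, :i2, :i3]       # e.g. items annotated to an ontology term
listB = [:i2, :i3, :i4, :i5]  # e.g. an external item list
total = 10                    # size of the item universe
# Contingency table derived internally:
#                 in listB   not in listB
#   in listA          2            1
#   not in listA      2            5      # total - |listA U listB|
pval = get_fisher_exact_test(listA, listB, total)
```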
data/lib/semtools/ontology.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'json'
+require 'colorize'
 
 
 class Ontology
@@ -38,7 +39,7 @@
   # => @removable_terms :: array of terms to not be considered
   # => @term_paths :: metainfo about parental paths of each term
 
-  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:
+  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
   @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
   @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
   @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@
   # Instantiate a OBO_Handler object
   # ===== Parameters
   # +file+:: with info to be loaded (.obo ; .json)
-  # +load_file+:: activate load process automatically
+  # +load_file+:: activate load process automatically
   # +removable_terms+: term to be removed from calcs
   # +build+: flag to launch metainfo calculation
-
+  # +file_format+: force format type despite file extension. Can be :obo or :json
+  def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
     # Initialize object variables
     @header = nil
     @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@
     @items = {}
     @removable_terms = []
     @term_paths = {}
-    # Load if proceeds
     add_removable_terms(removable_terms) if !removable_terms.empty?
-
+    load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
+    # Load if proceeds
+    if load_file
+      fformat = file_format
+      fformat = File.extname(file) if fformat.nil? && !file.nil?
+      if fformat == :obo || fformat == ".obo"
+        load(file, build: build)
+      elsif fformat == :json || fformat == ".json"
+        self.read(file, build: build)
+      elsif !fformat.nil?
+        warn 'Format not allowed. Loading process will not be performed'
+      end
+    end
   end
 
 
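A minimal usage sketch of the resulting constructor behaviour (file names here are hypothetical):

```ruby
require 'semtools'

# Passing file: alone now implies loading; the format is inferred from the extension.
onto = Ontology.new(file: 'hp.obo')
# The new file_format keyword overrides a misleading extension.
onto_json = Ontology.new(file: 'hp.dump', file_format: :json)
```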
@@ -413,31 +426,54 @@
   # +bidirectional+:: calculate bidirectional similitude. Default: false
   # ===== Return
   # similitude calculated
-  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
+  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
     # Check
     raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
     raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
     micasA = []
     # Compare A -> B
     termsA.each do |tA|
-      micas =
-
-
-
-
-
-
-
+      micas = []
+      termsB.each do |tB|
+        if store_mica
+          value = @mica_index.dig(tA, tB)
+        else
+          value = nil
+        end
+        if value.nil?
+          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+          if store_mica
+            value = true if value.nil? # We use true to save that the operation was made but there is not mica value
+            add2nestHash(@mica_index, tA, tB, value)
+          end
+        end
+        micas << value if value.class == Float
+      end
+      if !micas.empty?
+        micasA << micas.max # Obtain maximum value
+      else
+        micasA << 0
+      end
+    end
+    means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
     # Compare B -> A
     if bidirectional
       means_simA = means_sim * micasA.size
-      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
-      means_sim = (means_simA + means_simB)
+      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
     end
     # Return
     return means_sim
   end
 
+  def add2nestHash(h, key1, key2, val)
+    query1 = h[key1]
+    if query1.nil?
+      h[key1] = {key2 => val}
+    else
+      query1[key2] = val
+    end
+  end
 
 # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
 # ===== Parameters
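In formula form, the rewritten compare implements a bidirectional best-match average (transcribing the code above; @mica_index simply memoizes pairwise results, with true marking pairs that were computed but yielded no MICA):

```latex
\mathrm{sim}_{A \to B} = \frac{1}{|A|} \sum_{t_a \in A} \max_{t_b \in B} \mathrm{sim}(t_a, t_b)
\qquad
\mathrm{sim}(A,B) = \frac{|A|\,\mathrm{sim}_{A \to B} + |B|\,\mathrm{sim}_{B \to A}}{|A| + |B|}
```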
@@ -462,12 +498,13 @@
       main_profiles = @profiles
     end
     # Compare
+    @mica_index = {}
     while !main_ids.empty?
       curr_id = main_ids.shift
       current_profile = main_profiles[curr_id]
       comp_ids.each do |id|
         profile = comp_profiles[id]
-        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
+        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
         query = profiles_similarity[curr_id]
         if query.nil?
           profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@
   # +alt_tag+:: tag used to expand alternative IDs
   # ===== Returns
   # true if process ends without errors and false in other cases
-  def get_index_alternatives(alt_tag: @@basic_tags[:alternative]
+  def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
     # Check input
     raise('stanzas terms empty') if @stanzas[:terms].empty?
     # Take all alternative IDs
    alt_ids2add = {}
     @stanzas[:terms].each do |id, tags|
-
-
-      alt_ids =
-
-
-
-
-
+      if id == tags[:id] # Avoid simulated alternative terms
+        # id = tags[:id] # Take always real ID in case of alternative terms simulted
+        alt_ids = tags[alt_tag]
+        if !alt_ids.nil?
+          alt_ids = alt_ids - @removable_terms - [id]
+          # Update info
+          alt_ids.each do |alt_term|
+            @alternatives_index[alt_term] = id
+            alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
+            @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
+          end
         end
       end
     end
@@ -510,10 +550,11 @@
   # ===== Returns
   # true if eprocess ends without errors and false in other cases
   def build_index()
-    self.get_index_alternatives
     self.get_index_obsoletes
+    self.get_index_alternatives
     self.get_index_child_parent_relations
     @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
+    ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
     @alternatives_index.compact!
     @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
     @obsoletes_index.compact!
@@ -536,8 +577,6 @@
     if @ancestors_index.empty?
       warn('ancestors_index object is empty')
     else
-      # Prepare useful variables
-      alternative_terms = @alternatives_index.keys
       # Per each term, add frequencies
       @stanzas[:terms].each do |id, tags|
         if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@
           @meta[id] = query
         end
         # Store metadata
-        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc|
-        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc|
+        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
+        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
         query[:struct_freq] = query[:descendants] + 1.0
         # Update maximums
         @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@
     # Check obsoletes
     @stanzas[:terms].each do |id, term_tags|
       next if term_tags.nil?
+      next if self.is_alternative?(id)
       query = term_tags[obs_tag]
       if !query.nil? && query == 'true' # Obsolete tag presence
         next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@
       end
     end
     # Store alternatives
-    @alternatives_index.each do |id,alt|
-
-
-    end
+    # @alternatives_index.each do |id,alt|
+    #   anc[id] = anc[alt] if anc.include?(alt)
+    #   des[id] = des[alt] if des.include?(alt)
+    # end
     # Check structure
     if ![:atomic,:sparse].include? structType
       structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@
   # the IC calculated
   def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
     term = termRaw.to_sym
+    curr_ics = @ics[type]
     # Check
     raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
     # Check if it's already calculated
-    return
+    return curr_ics[term] if (curr_ics.include? term) && !force
     # Calculate
     ic = - 1
+    term_meta = @meta[term]
     case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
     ###########################################
     #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@
     ###########################################
     when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
       # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
     when :resnik_observed
       # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
     # Lin
     # Jiang & Conrath
 
@@ -745,17 +787,17 @@
     ###########################################
     when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
       # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-      ic = 1 - Math.log10(
+      ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
       if :zhou # New Model of Semantic Similarity Measuring in Wordnet
         # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
         @ics[:seco][term] = ic # Special store
-        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(
+        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
       end
     when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
-      ic = -Math.log10((
+      ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
       # Knappe
     end
-
+    curr_ics[term] = ic
     return ic
   end
 
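Collecting the IC variants above as equations, transcribed from the inline comments and the completed code (the code uses log base 10; struct_freq corresponds to hypo(x) + 1):

```latex
\mathrm{IC}_{\mathrm{Resnik}}(x) = -\log\frac{f_{\mathrm{struct}}(x)}{\max_t f_{\mathrm{struct}}(t)}
\qquad
\mathrm{IC}_{\mathrm{Seco}}(x) = 1 - \frac{\log(\mathrm{hypo}(x)+1)}{\log(N_{\mathrm{terms}})}
```
```latex
\mathrm{IC}_{\mathrm{Zhou}}(x) = k\,\mathrm{IC}_{\mathrm{Seco}}(x) + (1-k)\,\frac{\log(\mathrm{depth}(x))}{\log(\mathrm{depth}_{\max})}
\qquad
\mathrm{IC}_{\mathrm{Sanchez}}(x) = -\log\frac{\mathrm{hypo}(x)/\mathrm{anc}(x) + 1}{\mathrm{depth}_{\max} + 1}
```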
@@ -788,8 +830,8 @@
   # ===== Returns
   # the IC of the MICA(termA,termB)
   def get_ICMICA(termA, termB, ic_type = :resnik)
-
-    return
+    term, ic = self.get_MICA(termA, termB, ic_type)
+    return term.nil? ? nil : ic
   end
 
 
@@ -812,19 +854,12 @@
     # Obtain ancestors (include itselfs too)
     anc_A = self.get_ancestors(termA)
     anc_B = self.get_ancestors(termB)
-
     if !(anc_A.empty? && anc_B.empty?)
       anc_A << termA
       anc_B << termB
-      # Find shared ancestors
-
-
-      if shared_ancestors.length > 0
-        shared_ancestors.each do |anc|
-          ic = self.get_IC(anc, type: ic_type)
-          # Check
-          mica = [anc,ic] if ic > mica[1]
-        end
+      (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
+        ic = self.get_IC(anc, type: ic_type)
+        mica = [anc,ic] if ic > mica[1]
       end
     end
   end
@@ -844,9 +879,8 @@
     # Check
     raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
     sim = nil
-
-
-    if !sim_res.nil?
+    mica, sim_res = get_MICA(termA, termB, ic_type)
+    if !mica.nil?
       case type
       when :resnik
         sim = sim_res
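The :lin and :jiang_conrath branches fall outside this hunk; for reference, their standard literature definitions in terms of the MICA (not visible in this diff, so shown only as the conventional formulas):

```latex
\mathrm{sim}_{\mathrm{Resnik}}(a,b) = \mathrm{IC}(\mathrm{MICA}(a,b))
\qquad
\mathrm{sim}_{\mathrm{Lin}}(a,b) = \frac{2\,\mathrm{IC}(\mathrm{MICA}(a,b))}{\mathrm{IC}(a)+\mathrm{IC}(b)}
\qquad
\mathrm{dist}_{\mathrm{JC}}(a,b) = \mathrm{IC}(a)+\mathrm{IC}(b)-2\,\mathrm{IC}(\mathrm{MICA}(a,b))
```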
@@ -922,6 +956,16 @@
     jsonFile = File.open(file)
     jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
     # Pre-process (Symbolize some hashs values)
+    if !jsonInfo[:header].nil?
+      aux = jsonInfo[:header].map do |entry,info|
+        if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
+          [entry,info.map{|item| item.to_sym}]
+        else
+          [entry,info]
+        end
+      end
+      jsonInfo[:header] = aux.to_h
+    end
     jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
     jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
     jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@
         if checked.nil?
           t
         else
-          byValue[checked] = byValue.delete(t) if checked != t &&
+          byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
           checked
         end
       end
@@ -1134,7 +1178,8 @@
         else
           aux = self.extract_id(referenceValue)
         end
-
+        aux.compact! unless aux.nil?
+        referenceValue = aux unless aux.nil?
       end
       referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
       byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@
   # ===== Returns
   # cleaned profile
   def clean_profile(profile, remove_alternatives: true)
+    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
     terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
     if remove_alternatives
       terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@
     return terms_without_ancestors_and_alternatices
   end
 
+  def clean_profile_hard(profile)
+    profile, _ = check_ids(profile)
+    profile = profile.select{|t| !is_obsolete?(t)}
+    profile = clean_profile(profile.uniq)
+    return profile
+  end
+
+  # Remove terms from a given profile using hierarchical info and scores set given
+  # ===== Parameters
+  # +profile+:: profile to be cleaned
+  # +scores+:: hash with terms by keys and numerical values (scores)
+  # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
+  # +remove_without_score+:: if true, terms without score will be removed. Default: true
+  # ===== Returns
+  # cleaned profile
+  def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+    scores = scores.sort_by{|term,score| score}.to_h
+    keep = profile.map do |term|
+      if scores.include?(term)
+        parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+        targetable = parentals.select{|parent| profile.include?(parent)}
+        if targetable.empty?
+          term
+        else
+          targetable << term
+          targets = scores.select{|term,score| targetable.include?(term)}.to_h
+          byMax ? targets.keys.last : targets.keys.first
+        end
+      elsif remove_without_score
+        nil
+      else
+        term
+      end
+    end
+    return keep.compact.uniq
+  end
+
 
 # Remove alternatives (if official term is present) and ancestors terms of stored profiles
 # ===== Parameters
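A usage sketch for the new clean_profile_by_score (term IDs are hypothetical; assumes HP:0000118 is a descendant of HP:0000001 in the loaded ontology):

```ruby
profile = [:'HP:0000001', :'HP:0000118']
scores  = { :'HP:0000001' => 0.2, :'HP:0000118' => 0.9 }
onto.clean_profile_by_score(profile, scores, byMax: true)
#=> [:'HP:0000118']  # within each hierarchically related group, the best-scored term survives
```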
@@ -1635,44 +1718,45 @@
 
   # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
   # Also calculates paths metadata and stores into @term_paths
-  def calc_term_paths
-    self.calc_ancestors_dictionary if
-    visited_terms =
+  def calc_term_paths(only_main_terms=false)
+    self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
+    visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
     @term_paths = {}
     if [:hierarchical, :sparse].include? @structureType
-
-
-      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
+      @stanzas[:terms].each do |term, t_attributes|
+        if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
          special_term = term
          term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          @term_paths[special_term] = @term_paths[term]
-          visited_terms
+          visited_terms[special_term] = true
        end
-
        if !visited_terms.include?(term)
-
+          # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
+          path_attr = @term_paths[term]
+          if path_attr.nil?
+            path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
+            @term_paths[term] = path_attr #save path data container
+          end
          parentals = @dicts[:is_a][:byTerm][term]
          if parentals.nil?
-
+            path_attr[:paths] << [term]
          else
            parentals.each do |direct_parental|
-
-
-
-              self.expand_path(direct_parental, visited_terms)
-              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
-            end
-            new_paths.each{|path| @term_paths[term][:paths] << path}
+              self.expand_path(direct_parental)
+              new_paths = @term_paths[direct_parental][:paths]
+              path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
            end
-          end
-          visited_terms
+          end
+          anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
+          visited_terms[term] = true
        end
        # Update metadata
-
-
-
-
+        path_attr = @term_paths[term]
+        path_attr[:total_paths] = path_attr[:paths].length
+        paths_sizes = path_attr[:paths].map{|path| path.length}
+        path_attr[:largest_path] = paths_sizes.max
+        path_attr[:shortest_path] = paths_sizes.min
      end
     else
       warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
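For reference, the record that calc_term_paths leaves in @term_paths per term, as built by the code above (the term ID is hypothetical):

```ruby
# @term_paths[:'HP:0000002']
#=> {
#     total_paths: 2,      # number of root-reaching paths found
#     largest_path: 4,     # node count of the longest path
#     shortest_path: 3,    # node count of the shortest path
#     paths: [[:'HP:0000002', :'HP:0000001'], ...]  # each path begins with the term itself
#   }
```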
@@ -1684,20 +1768,25 @@
   # ===== Parameters
   # +curr_term+:: current visited term
   # +visited_terms+:: already expanded terms
-  def expand_path(curr_term
-    if
-
+  def expand_path(curr_term)
+    if !@term_paths.include?(curr_term)
+      path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+      @term_paths[curr_term] = path_attr
       direct_parentals = @dicts[:is_a][:byTerm][curr_term]
       if direct_parentals.nil? # No parents :: End of recurrence
-
+        path_attr[:paths] << [curr_term]
       else # Expand and concat
         direct_parentals.each do |ancestor|
-
-
-
+          path_attr_parental = @term_paths[ancestor]
+          if path_attr_parental.nil? # Calculate new paths
+            self.expand_path(ancestor)
+            new_paths = @term_paths[ancestor][:paths]
+          else # Use direct_parental paths already calculated
+            new_paths = path_attr_parental[:paths]
+          end
+          path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
         end
       end
-      visited_terms << curr_term
     end
   end
 
@@ -1717,6 +1806,26 @@
     return @dicts[:level][:byValue][term]
   end
 
+  # nil, term not found, [] term exists but not has parents
+  def get_parental_path(term, which_path = :shortest_path, level = 0)
+    path = nil
+    path_attr = @term_paths[term]
+    if !path_attr.nil?
+      path_length = path_attr[which_path]
+      all_paths = path_attr[:paths]
+      if all_paths.empty?
+        path = []
+      else
+        path = all_paths.select{|pt| pt.length == path_length}.first.clone
+        if level > 0 # we want the term and his ascendants until a specific level
+          n_parents = path_length - level
+          path = path[0..n_parents]
+        end
+        path.shift # Discard the term itself
+      end
+    end
+    return path
+  end
 
 # Return ontology levels from profile terms
 # ===== Returns
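A usage sketch for get_parental_path (hypothetical term IDs; requires calc_term_paths to have populated @term_paths):

```ruby
onto.get_parental_path(:'HP:0000118')                     # shortest path to root, the term itself removed
onto.get_parental_path(:'HP:0000118', :largest_path)      # follow the longest path instead
onto.get_parental_path(:'HP:0000118', :shortest_path, 2)  # keep only ascendants down to level 2
onto.get_parental_path(:'HP:9999999')                     # nil when the term has no path record
```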
@@ -1737,6 +1846,83 @@
     return levels_filtered
   end
 
+  def get_profile_ontology_distribution_tables
+    cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+    uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+    hpo_ontology_levels = get_ontology_levels
+    total_ontology_terms = hpo_ontology_levels.values.flatten.length
+    total_cohort_terms = cohort_ontology_levels.values.flatten.length
+    total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
+
+    ontology_levels = []
+    distribution_percentage = []
+    hpo_ontology_levels.each do |level, terms|
+      cohort_terms = cohort_ontology_levels[level]
+      uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+      if cohort_terms.nil? || uniq_cohort_terms.nil?
+        num = 0
+        u_num = 0
+      else
+        num = cohort_terms.length
+        u_num = uniq_cohort_terms.length
+      end
+      ontology_levels << [level, terms.length, num]
+      distribution_percentage << [
+        level,
+        (terms.length.fdiv(total_ontology_terms)*100).round(3),
+        (num.fdiv(total_cohort_terms)*100).round(3),
+        (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+      ]
+    end
+    ontology_levels.sort! { |x,y| x.first <=> y.first }
+    distribution_percentage.sort! { |x,y| x.first <=> y.first }
+    return ontology_levels, distribution_percentage
+  end
+
+  def get_dataset_specifity_index(mode)
+    ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+    if mode == 'uniq'
+      observed_distribution = 3
+    elsif mode == 'weigthed'
+      observed_distribution = 2
+    end
+    max_terms = distribution_percentage.map{|row| row[1]}.max
+    maxL = nil
+    distribution_percentage.each do |level_info|
+      maxL = level_info.first if level_info[1] == max_terms
+    end
+    diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+    diffL.select!{|dL| dL.last > 0}
+    lowSection = diffL.select{|dL| dL.first <= maxL}
+    highSection = diffL.select{|dL| dL.first > maxL}
+    dsi = nil
+    if highSection.empty?
+      dsi = 0
+    else
+      accumulated_weigth = 0
+      accumulated_weigthed_diffL = 0
+      hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+      lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+      dsi = hss.fdiv(lss)
+    end
+    return dsi
+  end
+
+  def get_weigthed_level_contribution(section, maxL, nLevels)
+    accumulated_weigthed_diffL = 0
+    section.each do |level, diff|
+      weightL = maxL - level
+      if weightL >= 0
+        weightL += 1
+      else
+        weightL = weightL.abs
+      end
+      accumulated_weigthed_diffL += diff * weightL
+    end
+    weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+    return weigthed_contribution
+  end
+
 
   # Calculate profiles dictionary with Key= Term; Value = Profiles
   def calc_profiles_dictionary
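Reading the new dataset specificity index as a formula (my interpretation of the code above, not stated in the source): per level, the positive excess of the observed distribution over the ontology's is weighted by distance from the modal level maxL, and the high-level section is compared against the low-level one.

```latex
\Delta_\ell = p^{\mathrm{obs}}_\ell - p^{\mathrm{onto}}_\ell \;(>0 \text{ kept}), \qquad
C_{\mathrm{sec}} = \frac{1}{n_{\mathrm{levels}}} \sum_{\ell \in \mathrm{sec}} w_\ell\,\Delta_\ell, \qquad
\mathrm{DSI} = \frac{C_{\mathrm{high}}}{C_{\mathrm{low}}}
```

where w is maxL - l + 1 at or below the modal level and l - maxL above it.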
@@ -1808,17 +1994,66 @@
       end
     end
     if expand
-
-
-
-
-
-
-
+      @items = self.concatItems(@items,relations)
+      # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
+      #   if @items.keys.include?(k)
+      #     if v.kind_of?(Array)
+      #       @items[k] = (@items[k] + v).uniq
+      #     elsif v.kind_of?(Hash)
+      #       @items.merge!(relations) do |k, oldV, newV|
+      #         if oldV.kind_of?(Array)
+      #           return (oldV + newV).uniq
+      #         else
+      #           oldV = [oldV,newV]
+      #         end
+      #       end
+      #     elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
+      #       @items[k] = (@items[k] + [v]).uniq
+      #     else
+      #       @items[k] = [@items[k],v]
+      #     end
+      #   else
+      #     @items[k] = v
+      #   end
+      # end
     else
       @items.merge!(relations)
     end
-  end
+  end
+
+  # Internal function to concat two elements.
+  # ===== Parameters
+  # +itemA+:: item to be concatenated
+  # +itemB+:: item to be concatenated
+  # ===== Returns
+  # Concatenated objects
+  def concatItems(itemA,itemB)
+    # A is Array :: RETURN ARRAY
+    #   A_array : B_array
+    #   A_array : B_hash => NOT ALLOWED
+    #   A_array : B_single => NOT ALLOWED
+    # A is Hash :: RETURN HASH
+    #   A_hash : B_array => NOT ALLOWED
+    #   A_hash : B_hash
+    #   A_hash : B_single => NOT ALLOWED
+    # A is single element => RETURN ARRAY
+    #   A_single : B_array
+    #   A_single : B_hash => NOT ALLOWED
+    #   A_single : B_single
+    concatenated = nil
+    if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+      concatenated = (itemA + itemB).uniq
+    elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+      concatenated = itemA.merge(itemB) do |k, oldV, newV|
+        self.concatItems(oldV,newV)
+      end
+    elsif itemB.kind_of?(Array)
+      concatenated = ([itemA] + itemB).uniq
+    elsif ![Array, Hash].include?(itemB.class)
+      concatenated = [itemA,itemB].uniq
+    end
+    return concatenated
+  end
 
 
   # Assign a dictionary already calculated as a items set.
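Behaviour sketch for the new concatItems (values are made up; the method is what the items-expansion branch above now delegates to):

```ruby
onto.concatItems([1, 2], [2, 3])              #=> [1, 2, 3]
onto.concatItems({a: [1]}, {a: [2], b: [3]})  #=> {a: [1, 2], b: [3]}  (hash values merged recursively)
onto.concatItems(:x, [:y])                    #=> [:x, :y]
```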
@@ -1826,7 +2061,7 @@
   # +dictID+:: dictionary ID to be stored (:byTerm will be used)
   def set_items_from_dict(dictID, remove_old_relations = false)
     @items = {} if remove_old_relations
-    if
+    if !@dicts[dictID].nil?
       @items.merge(@dicts[dictID][:byTerm])
     else
       warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@
     curr_keys.map do |term_expand|
       to_infer = []
       # Obtain childs
-      childs = self.get_descendants(term_expand,true).select{|t|
+      childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
       # Expand
       if childs.length > 0 && minimum_childs == 1 # Special case
         to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@
   end
 
 
+  # Return direct ancestors/descendants of a given term
+  # ===== Parameters
+  # +term+:: which are requested
+  # +relation+:: can be :ancestor or :descendant
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct ancestors/descendants of given term or nil if any error occurs
+  def get_direct_related(term, relation, remove_alternatives: false)
+    if @dicts[:is_a].nil?
+      warn("Hierarchy dictionary is not already calculated. Returning nil")
+      return nil
+    end
+    target = nil
+    case relation
+    when :ancestor
+      target = :byTerm
+    when :descendant
+      target = :byValue
+    else
+      warn('Relation type not allowed. Returning nil')
+    end
+    return nil if target.nil?
+    query = @dicts[:is_a][target][term]
+    return query if query.nil?
+    query, _ = remove_alternatives_from_profile(query) if remove_alternatives
+    return query
+  end
+
+
+  # Return direct ancestors of a given term
+  # ===== Parameters
+  # +term+:: which ancestors are requested
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct ancestors of given term or nil if any error occurs
+  def get_direct_ancentors(term, remove_alternatives: false)
+    return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
+  end
+
+  # Return direct descendants of a given term
+  # ===== Parameters
+  # +term+:: which descendants are requested
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct descendants of given term or nil if any error occurs
+  def get_direct_descendants(term, remove_alternatives: false)
+    return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
+  end
+
+
+
+  #============================================================================
+  #============================================================================
 
   # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
   # ===== Parameters
   # ++::
   # ===== Returns
   # ...
-  def compute_relations_to_items(external_item_list, mode, thresold)
+  def compute_relations_to_items(external_item_list, total_items, mode, thresold)
+    terms_levels = list_terms_per_level_from_items
+    #puts terms_levels.inspect.yellow
+    connect_familiars!(terms_levels)
+    #puts terms_levels.inspect.blue
+    item_list_with_transf_parental = get_item_list_parental(terms_levels)
+    results = []
+    if mode == :elim
+      results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+    elsif mode == :weight
+      results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
+    end
+    return results
+  end
+
+  def get_item_list_parental(terms_levels)
+    transfered_list = {}
+    parent_dict = @dicts[:is_a][:byTerm]
+    levels = terms_levels.keys.sort
+    while levels.length > 1
+      level = levels.pop
+      terms_levels[level].each do |term|
+        parents = parent_dict[term]
+        if parents.nil?
+          next
+        elsif parents.length == 1
+          parent = parents.first
+        else
+          parent = (parents | terms_levels[level - 1]).first
+        end
+        term_it = @items[term]
+        parent_it = @items[parent]
+        curr_it = transfered_list[term]
+        parent_all_items = merge_groups([term_it, parent_it, curr_it])
+        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+        term_all_items = merge_groups([term_it, curr_it])
+        transfered_list[term] = term_all_items if !term_all_items.empty?
+      end
+    end
+    terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
+      transfered_list[term] = @items[term] if transfered_list[term].nil?
+    end
+    return transfered_list
+  end
+
+  def merge_groups(groups)
+    return groups.compact.inject([]){|it, a| it | a}
+  end
+
+  def list_terms_per_level_from_items
+    terms_levels = {}
+    @items.each do |term, items|
+      level = self.get_term_level(term)
+      query = terms_levels[level]
+      if query.nil?
+        terms_levels[level] = [term]
+      else
+        query << term
+      end
+    end
+    return terms_levels
+  end
+
+  def connect_familiars!(terms_levels)
+    levels = terms_levels.keys.sort
+    while levels.length > 1 # Process when current level has a parental level
+      level = levels.pop
+      parental_level = level - 1
+      parental_terms = terms_levels[parental_level]
+      if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
+        parental_terms = [] # Initialize required parental level
+        terms_levels[parental_level] = parental_terms
+        levels << parental_level
+      end
+      terms_levels[level].each do |term|
+        path_info = @term_paths[term]
+        shortest_path_length = path_info[:shortest_path]
+        path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+        parental = path[1] # the first elements is the term itself
+        parental_terms << parental if !parental_terms.include?(parental)
+      end
+    end
+  end
+
+  def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
     results = []
     penalized_terms = {}
-    # terms_levels = get_terms_levels(@items_relations.keys)
-    terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
-    terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
-    terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
     levels = terms_levels.keys.sort
     levels.reverse_each do |level|
       terms_levels[level].each do |term|
-        associated_items =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
+        associated_items = item_list[term]
+        items_to_remove = penalized_terms[term]
+        items_to_remove = [] if items_to_remove.nil?
+        pval = get_fisher_exact_test(
+          external_item_list - items_to_remove,
+          associated_items - items_to_remove,
+          #((associated_items | external_item_list) - items_to_remove).length
+          total_items
+        )
+        if pval <= thresold
+          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+          parents.each do |prnt|
+            query = penalized_terms[prnt]
+            if query.nil?
+              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+            else
+              query.concat(item_list[term])
            end
          end
        end
@@ -1974,6 +2341,81 @@
     return results
   end
 
+  def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
+    pvals = {}
+    item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
+    levels = terms_levels.keys.sort
+    levels.reverse_each do |level|
+      terms_levels[level].each do |term|
+        associated_items = item_list[term]
+        #initialize observed items in item_weigths_per_term list
+        add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
+        children = @dicts[:is_a][:byValue][term]
+        if children.nil?
+          children = []
+        else
+          children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
+        end
+        computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+      end
+    end
+    return pvals.to_a
+  end
+
+  def add_items_to_weigthed_list(term, associated_items, weigthed_list)
+    term_weigthing = weigthed_list[term]
+    associated_items.each{|ai| term_weigthing[ai] = 1}
+    weigthed_list[term] = term_weigthing
+  end
+
+  def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+    #puts term.to_s.red
+    #puts @term_paths[term].inspect
+    #puts @dicts[:is_a][:byValue][term].inspect.light_blue
+    associated_items = item_weigths_per_term[term].keys
+    pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
+      'two_sided', item_weigths_per_term[term], true)
+    pvals[term] = pval
+    if children.length > 0
+      rates = {}
+      sig_child = 0
+      children.each do |child|
+        ratio = sigRatio(pvals[child], pval)
+        rates[child] = ratio
+        sig_child += 1 if ratio >= 1
+      end
+      if sig_child == 0 # CASE 1
+        children.each do |child|
+          current_ratio = rates[child]
+          query_child = item_weigths_per_term[child]
+          query_child.transform_values!{|weight| weight * current_ratio}
+          pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
+            'two_sided', item_weigths_per_term[child], true)
+        end
+      else
+        ancs = get_ancestors(term, filter_alternatives = true)
+        ancs << term
+        rates.each do |ch, ratio|# CASE 2
+          if ratio >= 1 # The child is better than parent
+            ancs.each do |anc|
+              query_anc = item_weigths_per_term[anc]
+              associated_items.each do |item|
+                query_anc[item] /= ratio # /= --> query_anc[item]/ratio
+              end
+            end
+          end
+        end
+        computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
+      end
+    end
+  end
+
+  def sigRatio(pvalA, pvalB)
+    return Math.log(pvalA)/Math.log(pvalB)
+  end
+
+  #============================================================================
+  #============================================================================
 
   # Check if a given ID is a removable (blacklist) term.
   # +DEPRECATED+ use is_removable? instead
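The recursion in computeTermSig hinges on sigRatio; in equation form (transcribing the method above):

```latex
\mathrm{sigRatio}(p_{\mathrm{child}}, p_{\mathrm{parent}}) = \frac{\ln p_{\mathrm{child}}}{\ln p_{\mathrm{parent}}}
```

A ratio of at least 1 means the child's p-value is at least as small as (as significant as) its parent's, which selects between CASE 1 (down-weight all children) and CASE 2 (down-weight the ancestors).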
data/lib/semtools/version.rb
CHANGED
data/semtools.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semtools
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.6
 platform: ruby
 authors:
 - seoanezonjic
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text
@@ -53,6 +53,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.3
 description: This gem allows to perform ontology based operations and calculation
   of Semantic similarity and information coefficient using different implementations.
 email: