semtools 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/semtools/math_methods.rb +11 -3
- data/lib/semtools/ontology.rb +565 -123
- data/lib/semtools/version.rb +1 -1
- data/semtools.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
+  data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
+  data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
data/lib/semtools/math_methods.rb
CHANGED
@@ -1,7 +1,8 @@
 # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
 #to cmpute fisher exact test
 #Fisher => http://www.biostathandbook.com/fishers.html
-def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
+def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
+  #puts '-', listA.inspect, listB.inspect, '-'
   listA_listB = listA & listB
   listA_nolistB = listA - listB
   nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
     listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-
-
+
+    if partial_weigths
+      nolistA_nolistB_count = all_elements_count - (listA | listB).length
+      all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
+    else
+      nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+      all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
+    end
   end
+  #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
   if tail == 'two_sided'
    accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  elsif tail == 'less'
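The new partial_weigths flag changes how the fourth cell of Fisher's 2x2 contingency table is obtained: from raw set sizes when true, from the full weight map otherwise. A minimal sketch of the table the counts above describe (example lists are hypothetical and unweighted):

    listA = [:t1, :t2, :t3]                          # e.g. items annotated to the query set
    listB = [:t2, :t3, :t4]                          # e.g. items annotated to the reference set
    all_elements_count = 10                          # size of the background universe
    a = (listA & listB).length                       # in both    => 2
    b = (listA - listB).length                       # only in A  => 1
    c = (listB - listA).length                       # only in B  => 1
    d = all_elements_count - (listA | listB).length  # in neither => 6
    # Fisher's exact test is then evaluated over the table [[a, b], [c, d]].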
data/lib/semtools/ontology.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'json'
+require 'colorize'


 class Ontology
@@ -38,7 +39,7 @@ class Ontology
  # => @removable_terms :: array of terms to not be considered
  # => @term_paths :: metainfo about parental paths of each term

- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@ class Ontology
  # Instantiate a OBO_Handler object
  # ===== Parameters
  # +file+:: with info to be loaded (.obo ; .json)
-	# +load_file+:: activate load process automatically
+	# +load_file+:: activate load process automatically
  # +removable_terms+: term to be removed from calcs
  # +build+: flag to launch metainfo calculation
-
+	# +file_format+: force format type despite file extension. Can be :obo or :json
+	def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
    # Initialize object variables
    @header = nil
    @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@ class Ontology
    @items = {}
    @removable_terms = []
    @term_paths = {}
-    # Load if proceeds
    add_removable_terms(removable_terms) if !removable_terms.empty?
-
+    load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
+    # Load if proceeds
+    if load_file
+      fformat = file_format
+      fformat = File.extname(file) if fformat.nil? && !file.nil?
+      if fformat == :obo || fformat == ".obo"
+        load(file, build: build)
+      elsif fformat == :json || fformat == ".json"
+        self.read(file, build: build)
+      elsif !fformat.nil?
+        warn 'Format not allowed. Loading process will not be performed'
+      end
+    end
  end

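With the new auto-load behaviour, passing file alone triggers loading, and file_format overrides extension detection. A hypothetical usage sketch (the file names are illustrative only):

    require 'semtools'

    onto = Ontology.new(file: 'hp.obo')                      # format inferred from the extension
    onto = Ontology.new(file: 'hp.dump', file_format: :json) # extension overridden explicitly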
@@ -413,31 +426,54 @@ class Ontology
  # +bidirectional+:: calculate bidirectional similitude. Default: false
  # ===== Return
  # similitude calculated
-	def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
+	def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
    # Check
    raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
    raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
    micasA = []
    # Compare A -> B
    termsA.each do |tA|
-      micas =
-
-
-
-
-
-
-
+      micas = []
+      termsB.each do |tB|
+        if store_mica
+          value = @mica_index.dig(tA, tB)
+        else
+          value = nil
+        end
+        if value.nil?
+          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+          if store_mica
+            value = true if value.nil? # We use true to save that the operation was made but there is not mica value
+            add2nestHash(@mica_index, tA, tB, value)
+          end
+        end
+        micas << value if value.class == Float
+      end
+      if !micas.empty?
+        micasA << micas.max # Obtain maximum value
+      else
+        micasA << 0
+      end
+    end
+    means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
    # Compare B -> A
    if bidirectional
      means_simA = means_sim * micasA.size
-      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
-      means_sim = (means_simA + means_simB)
+      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
    end
    # Return
    return means_sim
  end

+	def add2nestHash(h, key1, key2, val)
+		query1 = h[key1]
+		if query1.nil?
+			h[key1] = {key2 => val}
+		else
+			query1[key2] = val
+		end
+	end

  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
  # ===== Parameters
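When store_mica is set, @mica_index becomes a nested pairwise cache, so term pairs repeated across profile comparisons skip get_similarity entirely; true is stored as a sentinel for pairs that were computed but yielded no MICA. A minimal sketch of the same caching pattern, standing alone (slow_similarity is a hypothetical stand-in for get_similarity):

    # Hypothetical stand-in for an expensive pairwise computation
    def slow_similarity(a, b)
      rand
    end

    CACHE = {}

    def cached_sim(a, b)
      value = CACHE.dig(a, b)            # nil on a cache miss
      if value.nil?
        value = slow_similarity(a, b)
        value = true if value.nil?       # sentinel: pair computed, no MICA found
        (CACHE[a] ||= {})[b] = value     # same idea as add2nestHash in the diff
      end
      value.is_a?(Float) ? value : nil   # sentinel hits count as "no value"
    end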
@@ -462,12 +498,13 @@ class Ontology
      main_profiles = @profiles
    end
    # Compare
+    @mica_index = {}
    while !main_ids.empty?
      curr_id = main_ids.shift
      current_profile = main_profiles[curr_id]
      comp_ids.each do |id|
        profile = comp_profiles[id]
-        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
+        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
        query = profiles_similarity[curr_id]
        if query.nil?
          profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@ class Ontology
  # +alt_tag+:: tag used to expand alternative IDs
  # ===== Returns
  # true if process ends without errors and false in other cases
-	def get_index_alternatives(alt_tag: @@basic_tags[:alternative]
+	def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
    # Check input
    raise('stanzas terms empty') if @stanzas[:terms].empty?
    # Take all alternative IDs
    alt_ids2add = {}
    @stanzas[:terms].each do |id, tags|
-
-
-      alt_ids =
-
-
-
-
-
+      if id == tags[:id] # Avoid simulated alternative terms
+        # id = tags[:id] # Take always real ID in case of alternative terms simulted
+        alt_ids = tags[alt_tag]
+        if !alt_ids.nil?
+          alt_ids = alt_ids - @removable_terms - [id]
+          # Update info
+          alt_ids.each do |alt_term|
+            @alternatives_index[alt_term] = id
+            alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
+            @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
+          end
        end
      end
    end
@@ -510,10 +550,11 @@ class Ontology
  # ===== Returns
  # true if eprocess ends without errors and false in other cases
  def build_index()
-    self.get_index_alternatives
    self.get_index_obsoletes
+    self.get_index_alternatives
    self.get_index_child_parent_relations
    @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
+    ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
    @alternatives_index.compact!
    @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
    @obsoletes_index.compact!
@@ -536,8 +577,6 @@ class Ontology
    if @ancestors_index.empty?
      warn('ancestors_index object is empty')
    else
-      # Prepare useful variables
-      alternative_terms = @alternatives_index.keys
      # Per each term, add frequencies
      @stanzas[:terms].each do |id, tags|
        if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@ class Ontology
          @meta[id] = query
        end
        # Store metadata
-        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc|
-        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc|
+        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
+        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
        query[:struct_freq] = query[:descendants] + 1.0
        # Update maximums
        @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@ class Ontology
    # Check obsoletes
    @stanzas[:terms].each do |id, term_tags|
      next if term_tags.nil?
+      next if self.is_alternative?(id)
      query = term_tags[obs_tag]
      if !query.nil? && query == 'true' # Obsolete tag presence
        next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@ class Ontology
      end
    end
    # Store alternatives
-    @alternatives_index.each do |id,alt|
-
-
-    end
+    # @alternatives_index.each do |id,alt|
+    # 	anc[id] = anc[alt] if anc.include?(alt)
+    # 	des[id] = des[alt] if des.include?(alt)
+    # end
    # Check structure
    if ![:atomic,:sparse].include? structType
      structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@ class Ontology
  # the IC calculated
  def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
    term = termRaw.to_sym
+    curr_ics = @ics[type]
    # Check
    raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
    # Check if it's already calculated
-    return
+    return curr_ics[term] if (curr_ics.include? term) && !force
    # Calculate
    ic = - 1
+    term_meta = @meta[term]
    case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
    ###########################################
    #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@ class Ontology
    ###########################################
    when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
      # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
    when :resnik_observed
      # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
    # Lin
    # Jiang & Conrath

@@ -745,17 +787,17 @@ class Ontology
    ###########################################
    when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
      # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-      ic = 1 - Math.log10(
+      ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
      if :zhou # New Model of Semantic Similarity Measuring in Wordnet
        # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
        @ics[:seco][term] = ic # Special store
-        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(
+        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
      end
    when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
-      ic = -Math.log10((
+      ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
      # Knappe
    end
-
+    curr_ics[term] = ic
    return ic
  end

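For reference, the intrinsic IC formulas that the reconstructed lines implement, written out from the inline comments and the code (base-10 logarithms, as in the diff):

    IC_{Resnik}(x)  = -\log_{10}\frac{freq_{struct}(x)}{maxfreq_{struct}}
    IC_{Seco}(x)    = 1 - \frac{\log_{10} freq_{struct}(x)}{\log_{10}(|terms| - |alternatives|)}
    IC_{Zhou}(x)    = k \cdot IC_{Seco}(x) + (1-k)\,\frac{\log_{10} desc(x)}{\log_{10} maxdepth}
    IC_{Sanchez}(x) = -\log_{10}\frac{desc(x)/anc(x) + 1}{maxdepth + 1}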
@@ -788,8 +830,8 @@ class Ontology
  # ===== Returns
  # the IC of the MICA(termA,termB)
  def get_ICMICA(termA, termB, ic_type = :resnik)
-
-    return
+    term, ic = self.get_MICA(termA, termB, ic_type)
+    return term.nil? ? nil : ic
  end

@@ -812,19 +854,12 @@ class Ontology
    # Obtain ancestors (include itselfs too)
    anc_A = self.get_ancestors(termA)
    anc_B = self.get_ancestors(termB)
-
    if !(anc_A.empty? && anc_B.empty?)
      anc_A << termA
      anc_B << termB
-      # Find shared ancestors
-
-
-      if shared_ancestors.length > 0
-        shared_ancestors.each do |anc|
-          ic = self.get_IC(anc, type: ic_type)
-          # Check
-          mica = [anc,ic] if ic > mica[1]
-        end
+      (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
+        ic = self.get_IC(anc, type: ic_type)
+        mica = [anc,ic] if ic > mica[1]
      end
    end
  end
@@ -844,9 +879,8 @@ class Ontology
    # Check
    raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
    sim = nil
-
-
-    if !sim_res.nil?
+    mica, sim_res = get_MICA(termA, termB, ic_type)
+    if !mica.nil?
      case type
      when :resnik
        sim = sim_res
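All three measures listed in @@allowed_calcs derive from the MICA's IC. Only the Resnik branch is visible in this hunk; for context, the textbook definitions (the Jiang-Conrath form is usually expressed as a distance, which implementations then invert or normalize into a similarity):

    sim_{Resnik}(a,b) = IC(MICA(a,b))
    sim_{Lin}(a,b)    = \frac{2 \cdot IC(MICA(a,b))}{IC(a) + IC(b)}
    dist_{JC}(a,b)    = IC(a) + IC(b) - 2 \cdot IC(MICA(a,b))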
@@ -922,6 +956,16 @@ class Ontology
    jsonFile = File.open(file)
    jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
    # Pre-process (Symbolize some hashs values)
+    if !jsonInfo[:header].nil?
+      aux = jsonInfo[:header].map do |entry,info|
+        if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
+          [entry,info.map{|item| item.to_sym}]
+        else
+          [entry,info]
+        end
+      end
+      jsonInfo[:header] = aux.to_h
+    end
    jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
    jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
    jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@ class Ontology
      if checked.nil?
        t
      else
-        byValue[checked] = byValue.delete(t) if checked != t &&
+        byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
        checked
      end
    end
@@ -1134,7 +1178,8 @@ class Ontology
        else
          aux = self.extract_id(referenceValue)
        end
-
+        aux.compact! unless aux.nil?
+        referenceValue = aux unless aux.nil?
      end
      referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
      byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@ class Ontology
  # ===== Returns
  # cleaned profile
  def clean_profile(profile, remove_alternatives: true)
+    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
    terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
    if remove_alternatives
      terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@ class Ontology
    return terms_without_ancestors_and_alternatices
  end

+	def clean_profile_hard(profile)
+		profile, _ = check_ids(profile)
+		profile = profile.select{|t| !is_obsolete?(t)}
+		profile = clean_profile(profile.uniq)
+		return profile
+	end
+
+	# Remove terms from a given profile using hierarchical info and scores set given
+	# ===== Parameters
+	# +profile+:: profile to be cleaned
+	# +scores+:: hash with terms by keys and numerical values (scores)
+	# +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
+	# +remove_without_score+:: if true, terms without score will be removed. Default: true
+	# ===== Returns
+	# cleaned profile
+	def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+		scores = scores.sort_by{|term,score| score}.to_h
+		keep = profile.map do |term|
+			if scores.include?(term)
+				parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+				targetable = parentals.select{|parent| profile.include?(parent)}
+				if targetable.empty?
+					term
+				else
+					targetable << term
+					targets = scores.select{|term,score| targetable.include?(term)}.to_h
+					byMax ? targets.keys.last : targets.keys.first
+				end
+			elsif remove_without_score
+				nil
+			else
+				term
+			end
+		end
+		return keep.compact.uniq
+	end
+

  # Remove alternatives (if official term is present) and ancestors terms of stored profiles
  # ===== Parameters
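clean_profile_by_score resolves redundant parent/child pairs by score rather than by hierarchy alone. A hypothetical usage sketch (terms and scores invented; assumes :parent is an ancestor of :child in the loaded ontology):

    profile = [:parent, :child, :orphan]
    scores  = { parent: 0.2, child: 0.9 }

    # With byMax: true the higher-scored :child wins the parent/child clash;
    # :orphan has no score and is dropped (remove_without_score defaults to true).
    onto.clean_profile_by_score(profile, scores)
    # => [:child]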
@@ -1635,44 +1718,45 @@ class Ontology

  # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
  # Also calculates paths metadata and stores into @term_paths
-	def calc_term_paths
-		self.calc_ancestors_dictionary if
-		visited_terms =
+	def calc_term_paths(only_main_terms=false)
+		self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
+		visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
    @term_paths = {}
    if [:hierarchical, :sparse].include? @structureType
-
-
-      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
+      @stanzas[:terms].each do |term, t_attributes|
+        if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
          special_term = term
          term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          @term_paths[special_term] = @term_paths[term]
-          visited_terms
+          visited_terms[special_term] = true
        end
-
        if !visited_terms.include?(term)
-
+          # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
+          path_attr = @term_paths[term]
+          if path_attr.nil?
+            path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
+            @term_paths[term] = path_attr #save path data container
+          end
          parentals = @dicts[:is_a][:byTerm][term]
          if parentals.nil?
-
+            path_attr[:paths] << [term]
          else
            parentals.each do |direct_parental|
-
-
-
-              self.expand_path(direct_parental, visited_terms)
-              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
-            end
-            new_paths.each{|path| @term_paths[term][:paths] << path}
+              self.expand_path(direct_parental)
+              new_paths = @term_paths[direct_parental][:paths]
+              path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
            end
-          end
-          visited_terms
+          end
+          anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
+          visited_terms[term] = true
        end
        # Update metadata
-
-
-
-
+        path_attr = @term_paths[term]
+        path_attr[:total_paths] = path_attr[:paths].length
+        paths_sizes = path_attr[:paths].map{|path| path.length}
+        path_attr[:largest_path] = paths_sizes.max
+        path_attr[:shortest_path] = paths_sizes.min
      end
    else
      warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
@@ -1684,20 +1768,25 @@ class Ontology
  # ===== Parameters
  # +curr_term+:: current visited term
  # +visited_terms+:: already expanded terms
-	def expand_path(curr_term
-		if
-
+	def expand_path(curr_term)
+		if !@term_paths.include?(curr_term)
+			path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+			@term_paths[curr_term] = path_attr
      direct_parentals = @dicts[:is_a][:byTerm][curr_term]
      if direct_parentals.nil? # No parents :: End of recurrence
-
+        path_attr[:paths] << [curr_term]
      else # Expand and concat
        direct_parentals.each do |ancestor|
-
-
-
+          path_attr_parental = @term_paths[ancestor]
+          if path_attr_parental.nil? # Calculate new paths
+            self.expand_path(ancestor)
+            new_paths = @term_paths[ancestor][:paths]
+          else # Use direct_parental paths already calculated
+            new_paths = path_attr_parental[:paths]
+          end
+          path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
        end
      end
-    visited_terms << curr_term
    end
  end
@@ -1717,6 +1806,26 @@ class Ontology
    return @dicts[:level][:byValue][term]
  end

+	# nil, term not found, [] term exists but not has parents
+	def get_parental_path(term, which_path = :shortest_path, level = 0)
+		path = nil
+		path_attr = @term_paths[term]
+		if !path_attr.nil?
+			path_length = path_attr[which_path]
+			all_paths = path_attr[:paths]
+			if all_paths.empty?
+				path = []
+			else
+				path = all_paths.select{|pt| pt.length == path_length}.first.clone
+				if level > 0 # we want the term and his ascendants until a specific level
+					n_parents = path_length - level
+					path = path[0..n_parents]
+				end
+				path.shift # Discard the term itself
+			end
+		end
+		return path
+	end

  # Return ontology levels from profile terms
  # ===== Returns
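A hypothetical call pattern for the new accessor (term IDs invented; calc_term_paths must have populated @term_paths first):

    onto.calc_term_paths
    onto.get_parental_path(:unknown_term)                 # => nil, term not tracked
    onto.get_parental_path(:leaf_term)                    # => shortest ancestor chain, nearest first
    onto.get_parental_path(:leaf_term, :largest_path, 3)  # longest path, truncated at level 3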
@@ -1737,6 +1846,83 @@ class Ontology
    return levels_filtered
  end

+	def get_profile_ontology_distribution_tables
+		cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+		uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+		hpo_ontology_levels = get_ontology_levels
+		total_ontology_terms = hpo_ontology_levels.values.flatten.length
+		total_cohort_terms = cohort_ontology_levels.values.flatten.length
+		total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
+
+		ontology_levels = []
+		distribution_percentage = []
+		hpo_ontology_levels.each do |level, terms|
+			cohort_terms = cohort_ontology_levels[level]
+			uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+			if cohort_terms.nil? || uniq_cohort_terms.nil?
+				num = 0
+				u_num = 0
+			else
+				num = cohort_terms.length
+				u_num = uniq_cohort_terms.length
+			end
+			ontology_levels << [level, terms.length, num]
+			distribution_percentage << [
+				level,
+				(terms.length.fdiv(total_ontology_terms)*100).round(3),
+				(num.fdiv(total_cohort_terms)*100).round(3),
+				(u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+			]
+		end
+		ontology_levels.sort! { |x,y| x.first <=> y.first }
+		distribution_percentage.sort! { |x,y| x.first <=> y.first }
+		return ontology_levels, distribution_percentage
+	end
+
+	def get_dataset_specifity_index(mode)
+		ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+		if mode == 'uniq'
+			observed_distribution = 3
+		elsif mode == 'weigthed'
+			observed_distribution = 2
+		end
+		max_terms = distribution_percentage.map{|row| row[1]}.max
+		maxL = nil
+		distribution_percentage.each do |level_info|
+			maxL = level_info.first if level_info[1] == max_terms
+		end
+		diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+		diffL.select!{|dL| dL.last > 0}
+		lowSection = diffL.select{|dL| dL.first <= maxL}
+		highSection = diffL.select{|dL| dL.first > maxL}
+		dsi = nil
+		if highSection.empty?
+			dsi = 0
+		else
+			accumulated_weigth = 0
+			accumulated_weigthed_diffL = 0
+			hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+			lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+			dsi = hss.fdiv(lss)
+		end
+		return dsi
+	end
+
+	def get_weigthed_level_contribution(section, maxL, nLevels)
+		accumulated_weigthed_diffL = 0
+		section.each do |level, diff|
+			weightL = maxL - level
+			if weightL >= 0
+				weightL += 1
+			else
+				weightL = weightL.abs
+			end
+			accumulated_weigthed_diffL += diff * weightL
+		end
+		weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+		return weigthed_contribution
+	end
+

  # Calculate profiles dictionary with Key= Term; Value = Profiles
  def calc_profiles_dictionary
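Read as a formula, get_dataset_specifity_index compares how far the profiles' level distribution sits above the ontology's own, split at the ontology's modal level maxL (this is a reading of the code above, not a formula stated in the source):

    DSI = \frac{\sum_{l > maxL} w_l d_l / (L - maxL)}{\sum_{l \le maxL} w_l d_l / maxL},
    \quad d_l = obs_l - ont_l > 0,
    \quad w_l = \begin{cases} maxL - l + 1 & l \le maxL \\ l - maxL & l > maxL \end{cases}

where obs_l and ont_l are the observed and ontology percentages of terms at level l, and L is the number of levels.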
@@ -1808,17 +1994,66 @@ class Ontology
      end
    end
    if expand
-
-
-
-
-
-
+      @items = self.concatItems(@items,relations)
+      # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
+      # 	if @items.keys.include?(k)
+      # 		if v.kind_of?(Array)
+      # 			@items[k] = (@items[k] + v).uniq
+      # 		elsif v.kind_of?(Hash)
+      # 			@items.merge!(relations) do |k, oldV, newV|
+      # 				if oldV.kind_of?(Array)
+      # 					return (oldV + newV).uniq
+      # 				else
+      # 					oldV = [oldV,newV]
+      # 				end
+      # 			end
+      # 		elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
+      # 			@items[k] = (@items[k] + [v]).uniq
+      # 		else
+      # 			@items[k] = [@items[k],v]
+      # 		end
+      # 	else
+      # 		@items[k] = v
+      # 	end
+      # end
    else
      @items.merge!(relations)
    end
-	end
+	end
+
+	# Internal function to concat two elements.
+	# ===== Parameters
+	# +itemA+:: item to be concatenated
+	# +itemB+:: item to be concatenated
+	# ===== Returns
+	# Concatenated objects
+	def concatItems(itemA,itemB)
+		# A is Array :: RETURN ARRAY
+		# 	A_array : B_array
+		# 	A_array : B_hash => NOT ALLOWED
+		# 	A_array : B_single => NOT ALLOWED
+		# A is Hash :: RETURN HASH
+		# 	A_hash : B_array => NOT ALLOWED
+		# 	A_hash : B_hash
+		# 	A_hash : B_single => NOT ALLOWED
+		# A is single element => RETURN ARRAY
+		# 	A_single : B_array
+		# 	A_single : B_hash => NOT ALLOWED
+		# 	A_single : B_single
+		concatenated = nil
+		if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+			concatenated = (itemA + itemB).uniq
+		elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+			concatenated = itemA.merge(itemB) do |k, oldV, newV|
+				self.concatItems(oldV,newV)
+			end
+		elsif itemB.kind_of?(Array)
+			concatenated = ([itemA] + itemB).uniq
+		elsif ![Array, Hash].include?(itemB.class)
+			concatenated = [itemA,itemB].uniq
+		end
+		return concatenated
+	end

  # Assign a dictionary already calculated as a items set.
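A small illustration of the recursive merge concatItems performs on nested hashes (keys and values invented):

    a = { GO1: [:itemA], GO2: { sub: [:x] } }
    b = { GO1: [:itemB], GO2: { sub: [:y] }, GO3: [:itemC] }

    onto.concatItems(a, b)
    # => { GO1: [:itemA, :itemB], GO2: { sub: [:x, :y] }, GO3: [:itemC] }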
@@ -1826,7 +2061,7 @@ class Ontology
  # +dictID+:: dictionary ID to be stored (:byTerm will be used)
  def set_items_from_dict(dictID, remove_old_relations = false)
    @items = {} if remove_old_relations
-    if
+    if !@dicts[dictID].nil?
      @items.merge(@dicts[dictID][:byTerm])
    else
      warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@ class Ontology
    curr_keys.map do |term_expand|
      to_infer = []
      # Obtain childs
-      childs = self.get_descendants(term_expand,true).select{|t|
+      childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
      # Expand
      if childs.length > 0 && minimum_childs == 1 # Special case
        to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@ class Ontology
  end


+	# Return direct ancestors/descendants of a given term
+	# ===== Parameters
+	# +term+:: which are requested
+	# +relation+:: can be :ancestor or :descendant
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct ancestors/descendants of given term or nil if any error occurs
+	def get_direct_related(term, relation, remove_alternatives: false)
+		if @dicts[:is_a].nil?
+			warn("Hierarchy dictionary is not already calculated. Returning nil")
+			return nil
+		end
+		target = nil
+		case relation
+		when :ancestor
+			target = :byTerm
+		when :descendant
+			target = :byValue
+		else
+			warn('Relation type not allowed. Returning nil')
+		end
+		return nil if target.nil?
+		query = @dicts[:is_a][target][term]
+		return query if query.nil?
+		query, _ = remove_alternatives_from_profile(query) if remove_alternatives
+		return query
+	end
+
+
+	# Return direct ancestors of a given term
+	# ===== Parameters
+	# +term+:: which ancestors are requested
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct ancestors of given term or nil if any error occurs
+	def get_direct_ancentors(term, remove_alternatives: false)
+		return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
+	end
+
+	# Return direct descendants of a given term
+	# ===== Parameters
+	# +term+:: which descendants are requested
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct descendants of given term or nil if any error occurs
+	def get_direct_descendants(term, remove_alternatives: false)
+		return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
+	end
+
+
+
+	#============================================================================
+	#============================================================================

  # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
  # ===== Parameters
  # ++::
  # ===== Returns
  # ...
-	def compute_relations_to_items(external_item_list, mode, thresold)
+	def compute_relations_to_items(external_item_list, total_items, mode, thresold)
+		terms_levels = list_terms_per_level_from_items
+		#puts terms_levels.inspect.yellow
+		connect_familiars!(terms_levels)
+		#puts terms_levels.inspect.blue
+		item_list_with_transf_parental = get_item_list_parental(terms_levels)
+		results = []
+		if mode == :elim
+			results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+		elsif mode == :weight
+			results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
+		end
+		return results
+	end
+
+	def get_item_list_parental(terms_levels)
+		transfered_list = {}
+		parent_dict = @dicts[:is_a][:byTerm]
+		levels = terms_levels.keys.sort
+		while levels.length > 1
+			level = levels.pop
+			terms_levels[level].each do |term|
+				parents = parent_dict[term]
+				if parents.nil?
+					next
+				elsif parents.length == 1
+					parent = parents.first
+				else
+					parent = (parents | terms_levels[level - 1]).first
+				end
+				term_it = @items[term]
+				parent_it = @items[parent]
+				curr_it = transfered_list[term]
+				parent_all_items = merge_groups([term_it, parent_it, curr_it])
+				transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+				term_all_items = merge_groups([term_it, curr_it])
+				transfered_list[term] = term_all_items if !term_all_items.empty?
+			end
+		end
+		terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
+			transfered_list[term] = @items[term] if transfered_list[term].nil?
+		end
+		return transfered_list
+	end
+
+	def merge_groups(groups)
+		return groups.compact.inject([]){|it, a| it | a}
+	end
+
+	def list_terms_per_level_from_items
+		terms_levels = {}
+		@items.each do |term, items|
+			level = self.get_term_level(term)
+			query = terms_levels[level]
+			if query.nil?
+				terms_levels[level] = [term]
+			else
+				query << term
+			end
+		end
+		return terms_levels
+	end
+
+	def connect_familiars!(terms_levels)
+		levels = terms_levels.keys.sort
+		while levels.length > 1 # Process when current level has a parental level
+			level = levels.pop
+			parental_level = level - 1
+			parental_terms = terms_levels[parental_level]
+			if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
+				parental_terms = [] # Initialize required parental level
+				terms_levels[parental_level] = parental_terms
+				levels << parental_level
+			end
+			terms_levels[level].each do |term|
+				path_info = @term_paths[term]
+				shortest_path_length = path_info[:shortest_path]
+				path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+				parental = path[1] # the first elements is the term itself
+				parental_terms << parental if !parental_terms.include?(parental)
+			end
+		end
+	end
+
+	def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
    results = []
    penalized_terms = {}
-    # terms_levels = get_terms_levels(@items_relations.keys)
-    terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
-    terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
-    terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
    levels = terms_levels.keys.sort
    levels.reverse_each do |level|
      terms_levels[level].each do |term|
-        associated_items =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
+        associated_items = item_list[term]
+        items_to_remove = penalized_terms[term]
+        items_to_remove = [] if items_to_remove.nil?
+        pval = get_fisher_exact_test(
+          external_item_list - items_to_remove,
+          associated_items - items_to_remove,
+          #((associated_items | external_item_list) - items_to_remove).length
+          total_items
+        )
+        if pval <= thresold
+          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+          parents.each do |prnt|
+            query = penalized_terms[prnt]
+            if query.nil?
+              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+            else
+              query.concat(item_list[term])
            end
          end
        end
|
|
1974
2341
|
return results
|
1975
2342
|
end
|
1976
2343
|
|
2344
|
+
def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
|
2345
|
+
pvals = {}
|
2346
|
+
item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
|
2347
|
+
levels = terms_levels.keys.sort
|
2348
|
+
levels.reverse_each do |level|
|
2349
|
+
terms_levels[level].each do |term|
|
2350
|
+
associated_items = item_list[term]
|
2351
|
+
#initialize observed items in item_weigths_per_term list
|
2352
|
+
add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
|
2353
|
+
children = @dicts[:is_a][:byValue][term]
|
2354
|
+
if children.nil?
|
2355
|
+
children = []
|
2356
|
+
else
|
2357
|
+
children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
|
2358
|
+
end
|
2359
|
+
computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
|
2360
|
+
end
|
2361
|
+
end
|
2362
|
+
return pvals.to_a
|
2363
|
+
end
|
2364
|
+
|
2365
|
+
def add_items_to_weigthed_list(term, associated_items, weigthed_list)
|
2366
|
+
term_weigthing = weigthed_list[term]
|
2367
|
+
associated_items.each{|ai| term_weigthing[ai] = 1}
|
2368
|
+
weigthed_list[term] = term_weigthing
|
2369
|
+
end
|
2370
|
+
|
2371
|
+
def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
|
2372
|
+
#puts term.to_s.red
|
2373
|
+
#puts @term_paths[term].inspect
|
2374
|
+
#puts @dicts[:is_a][:byValue][term].inspect.light_blue
|
2375
|
+
associated_items = item_weigths_per_term[term].keys
|
2376
|
+
pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
|
2377
|
+
'two_sided', item_weigths_per_term[term], true)
|
2378
|
+
pvals[term] = pval
|
2379
|
+
if children.length > 0
|
2380
|
+
rates = {}
|
2381
|
+
sig_child = 0
|
2382
|
+
children.each do |child|
|
2383
|
+
ratio = sigRatio(pvals[child], pval)
|
2384
|
+
rates[child] = ratio
|
2385
|
+
sig_child += 1 if ratio >= 1
|
2386
|
+
end
|
2387
|
+
if sig_child == 0 # CASE 1
|
2388
|
+
children.each do |child|
|
2389
|
+
current_ratio = rates[child]
|
2390
|
+
query_child = item_weigths_per_term[child]
|
2391
|
+
query_child.transform_values!{|weight| weight * current_ratio}
|
2392
|
+
pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
|
2393
|
+
'two_sided', item_weigths_per_term[child], true)
|
2394
|
+
end
|
2395
|
+
else
|
2396
|
+
ancs = get_ancestors(term, filter_alternatives = true)
|
2397
|
+
ancs << term
|
2398
|
+
rates.each do |ch, ratio|# CASE 2
|
2399
|
+
if ratio >= 1 # The child is better than parent
|
2400
|
+
ancs.each do |anc|
|
2401
|
+
query_anc = item_weigths_per_term[anc]
|
2402
|
+
associated_items.each do |item|
|
2403
|
+
query_anc[item] /= ratio # /= --> query_anc[item]/ratio
|
2404
|
+
end
|
2405
|
+
end
|
2406
|
+
end
|
2407
|
+
end
|
2408
|
+
computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
|
2409
|
+
end
|
2410
|
+
end
|
2411
|
+
end
|
2412
|
+
|
2413
|
+
def sigRatio(pvalA, pvalB)
|
2414
|
+
return Math.log(pvalA)/Math.log(pvalB)
|
2415
|
+
end
|
2416
|
+
|
2417
|
+
#============================================================================
|
2418
|
+
#============================================================================
|
1977
2419
|
|
1978
2420
|
# Check if a given ID is a removable (blacklist) term.
|
1979
2421
|
# +DEPRECATED+ use is_removable? instead
|
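sigRatio compares two p-values on a log scale. Since ln p is negative for p-values in (0, 1), the ratio is at least 1 exactly when the child's p-value is at most the parent's:

    \mathrm{sigRatio}(p_c, p_p) = \frac{\ln p_c}{\ln p_p} \ge 1 \iff p_c \le p_p \qquad (0 < p < 1)

so CASE 1 down-weights all children when none beats the parent, and CASE 2 down-weights the ancestors' items when some child does.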
data/lib/semtools/version.rb
CHANGED
data/semtools.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semtools
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.6
 platform: ruby
 authors:
 - seoanezonjic
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text
@@ -53,6 +53,20 @@ dependencies:
     - - ">="
     - !ruby/object:Gem::Version
       version: '0'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: 0.7.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: 0.7.3
 description: This gem allows to perform ontology based operations and calculation
   of Semantic similarity and information coefficient using different implementations.
 email: