semtools 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e68630d42a4faf01dc15fdfa9f1acd64425ef1396ed6f9ce0a8d76319922ba06
4
- data.tar.gz: 952d908af5370031df0f19c98ab69fbb59b51825f050b69714f4494e15f77f77
3
+ metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
4
+ data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
5
5
  SHA512:
6
- metadata.gz: 85792433d82f824297df87cb0927b24116425ddb2a72a3e2f461748e014aa27f4efc8f73fcd7d1e6c423acd7487b77d21c2a8c0b7b0f8530030f6246ad62ad64
7
- data.tar.gz: 2d0e0953f19d8c2cad2cc85a0c6d8c1cb9bf95f4dd1ee2d75aebcf15bdd3929d2938ede6544ed3f145ac5a8804b97af64f50a859ae7ecf8164f0ed4f07208fb2
6
+ metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
7
+ data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
@@ -1,7 +1,8 @@
1
1
  # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
2
  #to compute fisher exact test
3
3
  #Fisher => http://www.biostathandbook.com/fishers.html
4
- def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
4
+ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
+ #puts '-', listA.inspect, listB.inspect, '-'
5
6
  listA_listB = listA & listB
6
7
  listA_nolistB = listA - listB
7
8
  nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
16
17
  listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
17
18
  listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
19
  nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
- nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
20
- all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
20
+
21
+ if partial_weigths
22
+ nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
+ all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
+ else
25
+ nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
+ all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
+ end
21
28
  end
29
+ #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
22
30
  if tail == 'two_sided'
23
31
  accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
24
32
  elsif tail == 'less'
@@ -1,4 +1,5 @@
1
1
  require 'json'
2
+ require 'colorize'
2
3
 
3
4
 
4
5
  class Ontology
@@ -38,7 +39,7 @@ class Ontology
38
39
  # => @removable_terms :: array of terms to not be considered
39
40
  # => @term_paths :: metainfo about parental paths of each term
40
41
 
41
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
42
43
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
44
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
45
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@ class Ontology
52
53
  # Instantiate a OBO_Handler object
53
54
  # ===== Parameters
54
55
  # +file+:: with info to be loaded (.obo ; .json)
55
- # +load_file+:: activate load process automatically (only for .obo)
56
+ # +load_file+:: activate load process automatically
56
57
  # +removable_terms+: term to be removed from calcs
57
58
  # +build+: flag to launch metainfo calculation
58
- def initialize(file: nil, load_file: false, removable_terms: [], build: true)
59
+ # +file_format+: force format type despite file extension. Can be :obo or :json
60
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
59
61
  # Initialize object variables
60
62
  @header = nil
61
63
  @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@ class Ontology
74
76
  @items = {}
75
77
  @removable_terms = []
76
78
  @term_paths = {}
77
- # Load if proceeds
78
79
  add_removable_terms(removable_terms) if !removable_terms.empty?
79
- load(file, build: build) if load_file
80
+ load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
81
+ # Load if proceeds
82
+ if load_file
83
+ fformat = file_format
84
+ fformat = File.extname(file) if fformat.nil? && !file.nil?
85
+ if fformat == :obo || fformat == ".obo"
86
+ load(file, build: build)
87
+ elsif fformat == :json || fformat == ".json"
88
+ self.read(file, build: build)
89
+ elsif !fformat.nil?
90
+ warn 'Format not allowed. Loading process will not be performed'
91
+ end
92
+ end
80
93
  end
81
94
 
82
95
 
@@ -413,31 +426,54 @@ class Ontology
413
426
  # +bidirectional+:: calculate bidirectional similitude. Default: false
414
427
  # ===== Return
415
428
  # similitude calculated
416
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
429
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
417
430
  # Check
418
431
  raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
419
432
  raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
420
433
  micasA = []
421
434
  # Compare A -> B
422
435
  termsA.each do |tA|
423
- micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
424
- # Remove special cases
425
- [false,nil].each do |err_value| micas.delete(err_value) end
426
- # Obtain maximum value
427
- micasA << micas.max if micas.length > 0
428
- micasA << 0 if micas.length <= 0
429
- end
430
- means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
436
+ micas = []
437
+ termsB.each do |tB|
438
+ if store_mica
439
+ value = @mica_index.dig(tA, tB)
440
+ else
441
+ value = nil
442
+ end
443
+ if value.nil?
444
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
445
+ if store_mica
446
+ value = true if value.nil? # We use true to save that the operation was made but there is not mica value
447
+ add2nestHash(@mica_index, tA, tB, value)
448
+ end
449
+ end
450
+ micas << value if value.class == Float
451
+ end
452
+ if !micas.empty?
453
+ micasA << micas.max # Obtain maximum value
454
+ else
455
+ micasA << 0
456
+ end
457
+ end
458
+ means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
431
459
  # Compare B -> A
432
460
  if bidirectional
433
461
  means_simA = means_sim * micasA.size
434
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
435
- means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
462
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
463
+ means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
436
464
  end
437
465
  # Return
438
466
  return means_sim
439
467
  end
440
468
 
469
# Store a value in a two-level hash, creating the inner hash when the first-level key is new.
# ===== Parameters
# +h+:: two-level hash to be updated (it is modified in place)
# +key1+:: first-level key
# +key2+:: second-level key
# +val+:: value to be stored in h[key1][key2]
# ===== Returns
# the stored value
def add2nestHash(h, key1, key2, val)
  inner = h[key1]
  if inner.nil? # First value stored for key1
    inner = {}
    h[key1] = inner
  end
  inner[key2] = val
end
441
477
 
442
478
  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
443
479
  # ===== Parameters
@@ -462,12 +498,13 @@ class Ontology
462
498
  main_profiles = @profiles
463
499
  end
464
500
  # Compare
501
+ @mica_index = {}
465
502
  while !main_ids.empty?
466
503
  curr_id = main_ids.shift
467
504
  current_profile = main_profiles[curr_id]
468
505
  comp_ids.each do |id|
469
506
  profile = comp_profiles[id]
470
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
507
+ value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
471
508
  query = profiles_similarity[curr_id]
472
509
  if query.nil?
473
510
  profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@ class Ontology
485
522
  # +alt_tag+:: tag used to expand alternative IDs
486
523
  # ===== Returns
487
524
  # true if process ends without errors and false in other cases
488
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
525
+ def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
489
526
  # Check input
490
527
  raise('stanzas terms empty') if @stanzas[:terms].empty?
491
528
  # Take all alternative IDs
492
529
  alt_ids2add = {}
493
530
  @stanzas[:terms].each do |id, tags|
494
- alt_ids = tags[alt_tag]
495
- if !alt_ids.nil?
496
- alt_ids = alt_ids - @removable_terms
497
- # Update info
498
- alt_ids.each do |alt_term|
499
- @alternatives_index[alt_term] = id
500
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
501
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
531
+ if id == tags[:id] # Avoid simulated alternative terms
532
+ # id = tags[:id] # Take always real ID in case of alternative terms simulted
533
+ alt_ids = tags[alt_tag]
534
+ if !alt_ids.nil?
535
+ alt_ids = alt_ids - @removable_terms - [id]
536
+ # Update info
537
+ alt_ids.each do |alt_term|
538
+ @alternatives_index[alt_term] = id
539
+ alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
540
+ @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
541
+ end
502
542
  end
503
543
  end
504
544
  end
@@ -510,10 +550,11 @@ class Ontology
510
550
  # ===== Returns
511
551
  # true if process ends without errors and false in other cases
512
552
  def build_index()
513
- self.get_index_alternatives
514
553
  self.get_index_obsoletes
554
+ self.get_index_alternatives
515
555
  self.get_index_child_parent_relations
516
556
  @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
557
+ ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
517
558
  @alternatives_index.compact!
518
559
  @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
519
560
  @obsoletes_index.compact!
@@ -536,8 +577,6 @@ class Ontology
536
577
  if @ancestors_index.empty?
537
578
  warn('ancestors_index object is empty')
538
579
  else
539
- # Prepare useful variables
540
- alternative_terms = @alternatives_index.keys
541
580
  # Per each term, add frequencies
542
581
  @stanzas[:terms].each do |id, tags|
543
582
  if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@ class Ontology
556
595
  @meta[id] = query
557
596
  end
558
597
  # Store metadata
559
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
560
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
598
+ query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
599
+ query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
561
600
  query[:struct_freq] = query[:descendants] + 1.0
562
601
  # Update maximums
563
602
  @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@ class Ontology
582
621
  # Check obsoletes
583
622
  @stanzas[:terms].each do |id, term_tags|
584
623
  next if term_tags.nil?
624
+ next if self.is_alternative?(id)
585
625
  query = term_tags[obs_tag]
586
626
  if !query.nil? && query == 'true' # Obsolete tag presence
587
627
  next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@ class Ontology
633
673
  end
634
674
  end
635
675
  # Store alternatives
636
- @alternatives_index.each do |id,alt|
637
- anc[id] = anc[alt] if anc.include?(alt)
638
- des[id] = des[alt] if des.include?(alt)
639
- end
676
+ # @alternatives_index.each do |id,alt|
677
+ # anc[id] = anc[alt] if anc.include?(alt)
678
+ # des[id] = des[alt] if des.include?(alt)
679
+ # end
640
680
  # Check structure
641
681
  if ![:atomic,:sparse].include? structType
642
682
  structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@ class Ontology
704
744
  # the IC calculated
705
745
  def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
706
746
  term = termRaw.to_sym
747
+ curr_ics = @ics[type]
707
748
  # Check
708
749
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
709
750
  # Check if it's already calculated
710
- return @ics[type][term] if (@ics[type].include? term) && !force
751
+ return curr_ics[term] if (curr_ics.include? term) && !force
711
752
  # Calculate
712
753
  ic = - 1
754
+ term_meta = @meta[term]
713
755
  case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
714
756
  ###########################################
715
757
  #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@ class Ontology
726
768
  ###########################################
727
769
  when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
728
770
  # -log(Freq(x) / Max_Freq)
729
- ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
771
+ ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
730
772
  when :resnik_observed
731
773
  # -log(Freq(x) / Max_Freq)
732
- ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
774
+ ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
733
775
  # Lin
734
776
  # Jiang & Conrath
735
777
 
@@ -745,17 +787,17 @@ class Ontology
745
787
  ###########################################
746
788
  when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
747
789
  # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
748
- ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
790
+ ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
749
791
  if :zhou # New Model of Semantic Similarity Measuring in Wordnet
750
792
  # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
751
793
  @ics[:seco][term] = ic # Special store
752
- ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
794
+ ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
753
795
  end
754
796
  when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
755
- ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
797
+ ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
756
798
  # Knappe
757
799
  end
758
- @ics[type][term] = ic
800
+ curr_ics[term] = ic
759
801
  return ic
760
802
  end
761
803
 
@@ -788,8 +830,8 @@ class Ontology
788
830
  # ===== Returns
789
831
  # the IC of the MICA(termA,termB)
790
832
  def get_ICMICA(termA, termB, ic_type = :resnik)
791
- mica = self.get_MICA(termA, termB, ic_type)
792
- return mica.first.nil? ? nil : mica.last
833
+ term, ic = self.get_MICA(termA, termB, ic_type)
834
+ return term.nil? ? nil : ic
793
835
  end
794
836
 
795
837
 
@@ -812,19 +854,12 @@ class Ontology
812
854
  # Obtain ancestors (include itselfs too)
813
855
  anc_A = self.get_ancestors(termA)
814
856
  anc_B = self.get_ancestors(termB)
815
-
816
857
  if !(anc_A.empty? && anc_B.empty?)
817
858
  anc_A << termA
818
859
  anc_B << termB
819
- # Find shared ancestors
820
- shared_ancestors = anc_A & anc_B
821
- # Find MICA
822
- if shared_ancestors.length > 0
823
- shared_ancestors.each do |anc|
824
- ic = self.get_IC(anc, type: ic_type)
825
- # Check
826
- mica = [anc,ic] if ic > mica[1]
827
- end
860
+ (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
861
+ ic = self.get_IC(anc, type: ic_type)
862
+ mica = [anc,ic] if ic > mica[1]
828
863
  end
829
864
  end
830
865
  end
@@ -844,9 +879,8 @@ class Ontology
844
879
  # Check
845
880
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
846
881
  sim = nil
847
- # Launch comparissons
848
- sim_res = get_ICMICA(termA, termB, ic_type)
849
- if !sim_res.nil?
882
+ mica, sim_res = get_MICA(termA, termB, ic_type)
883
+ if !mica.nil?
850
884
  case type
851
885
  when :resnik
852
886
  sim = sim_res
@@ -922,6 +956,16 @@ class Ontology
922
956
  jsonFile = File.open(file)
923
957
  jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
924
958
  # Pre-process (Symbolize some hashs values)
959
+ if !jsonInfo[:header].nil?
960
+ aux = jsonInfo[:header].map do |entry,info|
961
+ if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
962
+ [entry,info.map{|item| item.to_sym}]
963
+ else
964
+ [entry,info]
965
+ end
966
+ end
967
+ jsonInfo[:header] = aux.to_h
968
+ end
925
969
  jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
926
970
  jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
927
971
  jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@ class Ontology
1106
1150
  if checked.nil?
1107
1151
  t
1108
1152
  else
1109
- byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
1153
+ byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
1110
1154
  checked
1111
1155
  end
1112
1156
  end
@@ -1134,7 +1178,8 @@ class Ontology
1134
1178
  else
1135
1179
  aux = self.extract_id(referenceValue)
1136
1180
  end
1137
- referenceValue = aux if !aux.nil?
1181
+ aux.compact! unless aux.nil?
1182
+ referenceValue = aux unless aux.nil?
1138
1183
  end
1139
1184
  referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
1140
1185
  byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@ class Ontology
1525
1570
  # ===== Returns
1526
1571
  # cleaned profile
1527
1572
  def clean_profile(profile, remove_alternatives: true)
1573
+ warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
1528
1574
  terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1529
1575
  if remove_alternatives
1530
1576
  terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@ class Ontology
1534
1580
  return terms_without_ancestors_and_alternatices
1535
1581
  end
1536
1582
 
1583
# Clean a profile in three steps: keep the first element returned by check_ids,
# discard the terms flagged by is_obsolete?, and pass the remaining unique terms to clean_profile.
# ===== Parameters
# +profile+:: array of terms to be cleaned
# ===== Returns
# the value returned by clean_profile for the filtered profile
def clean_profile_hard(profile)
  checked_terms, _ = check_ids(profile)
  current_terms = checked_terms.reject { |t| is_obsolete?(t) }
  return clean_profile(current_terms.uniq)
end
1589
+
1590
# Remove terms from a given profile using hierarchical info and the given set of scores.
# For each scored term, the term competes against its ancestors and descendants that are
# also present in the profile, and only the best (or worst) scored one is kept.
# ===== Parameters
# +profile+:: profile to be cleaned
# +scores+:: hash with terms as keys and numerical values (scores)
# +byMax+:: if true, the maximum scored term will be kept; if false, the minimum will be kept
# +remove_without_score+:: if true, terms without score will be removed. Default: true
# ===== Returns
# cleaned profile (without repeated terms)
def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
  # Terms ordered by ascending score: the first is the lowest scored, the last the highest
  ranked_terms = scores.sort_by { |_scored_term, score| score }.map { |scored_term, _score| scored_term }
  keep = profile.map do |term|
    if scores.include?(term)
      relatives = [get_ancestors(term), get_descendants(term)].flatten
      candidates = relatives.select { |relative| profile.include?(relative) }
      if candidates.empty?
        term # No hierarchical relative in the profile, nothing to compete with
      else
        candidates << term
        # Relatives without score do not take part in the selection
        ranked_candidates = ranked_terms.select { |scored_term| candidates.include?(scored_term) }
        byMax ? ranked_candidates.last : ranked_candidates.first
      end
    elsif remove_without_score
      nil
    else
      term
    end
  end
  return keep.compact.uniq
end
1619
+
1537
1620
 
1538
1621
  # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1539
1622
  # ===== Parameters
@@ -1635,44 +1718,45 @@ class Ontology
1635
1718
 
1636
1719
  # Find paths of a term following its ancestors and store all possible paths for it and its parentals.
1637
1720
  # Also calculates paths metadata and stores into @term_paths
1638
- def calc_term_paths
1639
- self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1640
- visited_terms = []
1721
+ def calc_term_paths(only_main_terms=false)
1722
+ self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
1723
+ visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
1641
1724
  @term_paths = {}
1642
1725
  if [:hierarchical, :sparse].include? @structureType
1643
- terms = @stanzas[:terms].keys
1644
- terms.each do |term|
1645
- if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1726
+ @stanzas[:terms].each do |term, t_attributes|
1727
+ if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
1646
1728
  special_term = term
1647
1729
  term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1648
1730
  @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1649
1731
  @term_paths[special_term] = @term_paths[term]
1650
- visited_terms << special_term
1732
+ visited_terms[special_term] = true
1651
1733
  end
1652
-
1653
1734
  if !visited_terms.include?(term)
1654
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1735
+ # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
1736
+ path_attr = @term_paths[term]
1737
+ if path_attr.nil?
1738
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
1739
+ @term_paths[term] = path_attr #save path data container
1740
+ end
1655
1741
  parentals = @dicts[:is_a][:byTerm][term]
1656
1742
  if parentals.nil?
1657
- @term_paths[term][:paths] << [term]
1743
+ path_attr[:paths] << [term]
1658
1744
  else
1659
1745
  parentals.each do |direct_parental|
1660
- if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1661
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1662
- else # Calculate new paths
1663
- self.expand_path(direct_parental, visited_terms)
1664
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1665
- end
1666
- new_paths.each{|path| @term_paths[term][:paths] << path}
1746
+ self.expand_path(direct_parental)
1747
+ new_paths = @term_paths[direct_parental][:paths]
1748
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
1667
1749
  end
1668
- end
1669
- visited_terms << term
1750
+ end
1751
+ anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
1752
+ visited_terms[term] = true
1670
1753
  end
1671
1754
  # Update metadata
1672
- @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1673
- paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1674
- @term_paths[term][:largest_path] = paths_sizes.max
1675
- @term_paths[term][:shortest_path] = paths_sizes.min
1755
+ path_attr = @term_paths[term]
1756
+ path_attr[:total_paths] = path_attr[:paths].length
1757
+ paths_sizes = path_attr[:paths].map{|path| path.length}
1758
+ path_attr[:largest_path] = paths_sizes.max
1759
+ path_attr[:shortest_path] = paths_sizes.min
1676
1760
  end
1677
1761
  else
1678
1762
  warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
@@ -1684,20 +1768,25 @@ class Ontology
1684
1768
  # ===== Parameters
1685
1769
  # +curr_term+:: current visited term
1686
1770
  # +visited_terms+:: already expanded terms
1687
- def expand_path(curr_term, visited_terms)
1688
- if !visited_terms.include?(curr_term) # Not already expanded
1689
- @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1771
+ def expand_path(curr_term)
1772
+ if !@term_paths.include?(curr_term)
1773
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
1774
+ @term_paths[curr_term] = path_attr
1690
1775
  direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1691
1776
  if direct_parentals.nil? # No parents :: End of recurrence
1692
- @term_paths[curr_term][:paths] << [curr_term]
1777
+ path_attr[:paths] << [curr_term]
1693
1778
  else # Expand and concat
1694
1779
  direct_parentals.each do |ancestor|
1695
- self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1696
- new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1697
- new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1780
+ path_attr_parental = @term_paths[ancestor]
1781
+ if path_attr_parental.nil? # Calculate new paths
1782
+ self.expand_path(ancestor)
1783
+ new_paths = @term_paths[ancestor][:paths]
1784
+ else # Use direct_parental paths already calculated
1785
+ new_paths = path_attr_parental[:paths]
1786
+ end
1787
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
1698
1788
  end
1699
1789
  end
1700
- visited_terms << curr_term
1701
1790
  end
1702
1791
  end
1703
1792
 
@@ -1717,6 +1806,26 @@ class Ontology
1717
1806
  return @dicts[:level][:byValue][term]
1718
1807
  end
1719
1808
 
1809
# Return the parental path of a term taken from the paths stored in @term_paths.
# ===== Parameters
# +term+:: term to be checked
# +which_path+:: key of the path metadata that holds the length of the wanted path (:shortest_path or :largest_path)
# +level+:: if greater than 0, the path is cut after the entry placed (path length - level) positions away from the term
# ===== Returns
# nil if the term has no entry in @term_paths, an empty array if the term has no parentals to return,
# or an array with the path entries that follow the term itself
def get_parental_path(term, which_path = :shortest_path, level = 0)
  path = nil
  path_attr = @term_paths[term]
  if !path_attr.nil?
    path_length = path_attr[which_path]
    all_paths = path_attr[:paths]
    if all_paths.empty?
      path = []
    else
      # Work on a copy, stored paths must not be modified
      path = all_paths.find { |pt| pt.length == path_length }.clone
      if level > 0 # we want the term and its ascendants until a specific level
        # A level beyond the path length would give a negative index that wraps around the array,
        # so it is limited to 0 (only the term itself, which is discarded below)
        n_parents = [path_length - level, 0].max
        path = path[0..n_parents]
      end
      path.shift # Discard the term itself
    end
  end
  return path
end
1720
1829
 
1721
1830
  # Return ontology levels from profile terms
1722
1831
  # ===== Returns
@@ -1737,6 +1846,83 @@ class Ontology
1737
1846
  return levels_filtered
1738
1847
  end
1739
1848
 
1849
# Build two tables describing how terms are distributed along the ontology levels,
# for the whole ontology and for the stored profiles.
# ===== Returns
# two arrays sorted by level:
# * ontology_levels: rows [level, number of ontology terms, number of profile terms]
# * distribution_percentage: rows [level, % of ontology terms, % of profile terms, % of unique profile terms], rounded to 3 decimals
def get_profile_ontology_distribution_tables
  # NOTE(review): the argument is the +uniq+ flag of get_ontology_levels_from_profiles; presumably false keeps repeated terms - confirm
  cohort_ontology_levels = get_ontology_levels_from_profiles(false)
  uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
  hpo_ontology_levels = get_ontology_levels
  total_ontology_terms = hpo_ontology_levels.values.flatten.length
  total_cohort_terms = cohort_ontology_levels.values.flatten.length
  total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
  # A total of zero gives 0.0 instead of dividing by zero
  percentage = lambda { |count, total| total.zero? ? 0.0 : (count.fdiv(total) * 100).round(3) }

  ontology_levels = []
  distribution_percentage = []
  hpo_ontology_levels.each do |level, terms|
    cohort_terms = cohort_ontology_levels[level]
    uniq_cohort_terms = uniq_cohort_ontology_levels[level]
    if cohort_terms.nil? || uniq_cohort_terms.nil? # Level without terms in the profiles
      num = 0
      u_num = 0
    else
      num = cohort_terms.length
      u_num = uniq_cohort_terms.length
    end
    ontology_levels << [level, terms.length, num]
    distribution_percentage << [
      level,
      percentage.call(terms.length, total_ontology_terms),
      percentage.call(num, total_cohort_terms),
      percentage.call(u_num, total_uniq_cohort_terms)
    ]
  end
  ontology_levels.sort! { |x, y| x.first <=> y.first }
  distribution_percentage.sort! { |x, y| x.first <=> y.first }
  return ontology_levels, distribution_percentage
end
1881
+
1882
# Calculate a specificity index for the stored profiles from the tables given by
# get_profile_ontology_distribution_tables. Only levels where the profiles percentage exceeds
# the ontology percentage are used; they are split around the level holding the highest
# percentage of ontology terms (maxL) and the weighted contribution of each side is compared.
# ===== Parameters
# +mode+:: 'uniq' to use the percentage of unique profile terms or 'weigthed' to use the percentage of all profile terms
# ===== Returns
# 0 if no level greater than maxL exceeds the ontology percentage; otherwise the contribution of
# the levels greater than maxL divided by the contribution of the levels up to maxL
# ===== Raises
# ArgumentError if +mode+ is not 'uniq' or 'weigthed'
def get_dataset_specifity_index(mode)
  # Column of the percentage table that holds the observed distribution
  observed_distribution = case mode
                          when 'uniq' then 3
                          when 'weigthed' then 2
                          else
                            raise ArgumentError, "Mode specified (#{mode}) is not allowed. Use 'uniq' or 'weigthed'"
                          end
  ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
  max_terms = distribution_percentage.map { |row| row[1] }.max
  # With ties, the last level holding the maximum percentage is taken
  top_row = distribution_percentage.reverse_each.find { |row| row[1] == max_terms }
  maxL = top_row.nil? ? nil : top_row.first
  diffL = distribution_percentage.map { |row| [row[0], row[observed_distribution] - row[1]] }
  diffL.select! { |dL| dL.last > 0 }
  lowSection, highSection = diffL.partition { |dL| dL.first <= maxL }
  dsi = nil
  if highSection.empty?
    dsi = 0
  else
    hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
    lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
    # NOTE: lss is 0.0 when lowSection is empty, so this division gives a non-finite value
    dsi = hss.fdiv(lss)
  end
  return dsi
end
1910
+
1911
# Accumulate the differences of a set of levels, each one multiplied by a weight that depends
# on its distance to a reference level, and divide the total by a number of levels.
# ===== Parameters
# +section+:: array of [level, difference] pairs
# +maxL+:: reference level used to calculate the weights
# +nLevels+:: number of levels used to divide the accumulated value
# ===== Returns
# the accumulated weighted difference divided by +nLevels+ (Float)
def get_weigthed_level_contribution(section, maxL, nLevels)
  accumulated_weigthed_diffL = section.inject(0) do |accumulated, (level, diff)|
    distance = maxL - level
    # Levels up to maxL weigh distance + 1 (maxL itself weighs 1); levels beyond maxL weigh their distance
    weightL = distance >= 0 ? distance + 1 : distance.abs
    accumulated + diff * weightL
  end
  return accumulated_weigthed_diffL.fdiv(nLevels)
end
+ end
1925
+
1740
1926
 
1741
1927
  # Calculate profiles dictionary with Key= Term; Value = Profiles
1742
1928
  def calc_profiles_dictionary
@@ -1808,17 +1994,66 @@ class Ontology
1808
1994
  end
1809
1995
  end
1810
1996
  if expand
1811
- relations.each do |k,v|
1812
- if @items.keys.include?(k)
1813
- @items[k] = (@items[k] + v).uniq
1814
- else
1815
- @items[k] = v
1816
- end
1817
- end
1997
+ @items = self.concatItems(@items,relations)
1998
+ # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
1999
+ # if @items.keys.include?(k)
2000
+ # if v.kind_of?(Array)
2001
+ # @items[k] = (@items[k] + v).uniq
2002
+ # elsif v.kind_of?(Hash)
2003
+ # @items.merge!(relations) do |k, oldV, newV|
2004
+ # if oldV.kind_of?(Array)
2005
+ # return (oldV + newV).uniq
2006
+ # else
2007
+ # oldV = [oldV,newV]
2008
+ # end
2009
+ # end
2010
+ # elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
2011
+ # @items[k] = (@items[k] + [v]).uniq
2012
+ # else
2013
+ # @items[k] = [@items[k],v]
2014
+ # end
2015
+ # else
2016
+ # @items[k] = v
2017
+ # end
2018
+ # end
1818
2019
  else
1819
2020
  @items.merge!(relations)
1820
2021
  end
1821
- end
2022
+ end
2023
+
2024
# Internal function to concat two elements.
# ===== Parameters
# +itemA+:: item to be concatenated
# +itemB+:: item to be concatenated
# ===== Returns
# Concatenated objects or nil (with a warning) if the combination is not allowed
def concatItems(itemA,itemB)
  # A is Array :: RETURN ARRAY
  #   A_array : B_array
  #   A_array : B_hash   => NOT ALLOWED
  #   A_array : B_single => NOT ALLOWED
  # A is Hash :: RETURN HASH
  #   A_hash : B_array   => NOT ALLOWED
  #   A_hash : B_hash
  #   A_hash : B_single  => NOT ALLOWED
  # A is single element => RETURN ARRAY
  #   A_single : B_array
  #   A_single : B_hash  => NOT ALLOWED
  #   A_single : B_single
  a_is_array = itemA.kind_of?(Array)
  a_is_hash = itemA.kind_of?(Hash)
  b_is_array = itemB.kind_of?(Array)
  b_is_hash = itemB.kind_of?(Hash)
  if a_is_array && b_is_array
    return (itemA + itemB).uniq
  elsif a_is_hash && b_is_hash
    # Keys present in both hashes are concatenated recursively with the same rules
    return itemA.merge(itemB) { |_key, oldV, newV| concatItems(oldV, newV) }
  elsif !a_is_array && !a_is_hash && !b_is_hash
    # A is a single element: wrap it and join with B (array or single element)
    return b_is_array ? ([itemA] + itemB).uniq : [itemA, itemB].uniq
  end
  # Every remaining combination is marked as NOT ALLOWED in the table above.
  # They used to fall through to the single-element branches and produce nested
  # structures such as [[a, b], c] or [{...}, c].
  warn("Concatenation of #{itemA.class} with #{itemB.class} is not allowed. Returning nil")
  return nil
end
1822
2057
 
1823
2058
 
1824
2059
  # Assign a dictionary already calculated as a items set.
@@ -1826,7 +2061,7 @@ class Ontology
1826
2061
  # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1827
2062
  def set_items_from_dict(dictID, remove_old_relations = false)
1828
2063
  @items = {} if remove_old_relations
1829
- if(@dicts.keys.include?(dictID))
2064
+ if !@dicts[dictID].nil?
1830
2065
  @items.merge(@dicts[dictID][:byTerm])
1831
2066
  else
1832
2067
  warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@ class Ontology
1875
2110
  curr_keys.map do |term_expand|
1876
2111
  to_infer = []
1877
2112
  # Obtain childs
1878
- childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
2113
+ childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
1879
2114
  # Expand
1880
2115
  if childs.length > 0 && minimum_childs == 1 # Special case
1881
2116
  to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@ class Ontology
1931
2166
  end
1932
2167
 
1933
2168
 
2169
# Look up the direct relatives of a term in the :is_a dictionary.
# ===== Parameters
# +term+:: term whose direct relatives are requested
# +relation+:: :ancestor (reads the :byTerm side) or :descendant (reads the :byValue side)
# +remove_alternatives+:: if true, the result is filtered with remove_alternatives_from_profile
# ===== Returns
# Direct ancestors/descendants stored for the term. Returns nil when the :is_a
# dictionary is missing, the relation is not allowed or the term has no entry
def get_direct_related(term, relation, remove_alternatives: false)
  hierarchy = @dicts[:is_a]
  if hierarchy.nil?
    warn("Hierarchy dictionary is not already calculated. Returning nil")
    return nil
  end

  # Relation type -> side of the dictionary that must be queried
  dictionary_side = { ancestor: :byTerm, descendant: :byValue }[relation]
  if dictionary_side.nil?
    warn('Relation type not allowed. Returning nil')
    return nil
  end

  related = hierarchy[dictionary_side][term]
  return related if related.nil? || !remove_alternatives

  # Only the first element returned by the helper is kept
  filtered, _discarded = remove_alternatives_from_profile(related)
  filtered
end
2196
+
2197
+
2198
# Shortcut for get_direct_related using the :ancestor relation.
# ===== Parameters
# +term+:: term whose direct ancestors are requested
# +remove_alternatives+:: forwarded as is to get_direct_related
# ===== Returns
# Whatever get_direct_related returns for the :ancestor relation
def get_direct_ancentors(term, remove_alternatives: false)
  get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
end
2207
+
2208
# Shortcut for get_direct_related using the :descendant relation.
# ===== Parameters
# +term+:: term whose direct descendants are requested
# +remove_alternatives+:: forwarded as is to get_direct_related
# ===== Returns
# Whatever get_direct_related returns for the :descendant relation
def get_direct_descendants(term, remove_alternatives: false)
  get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
end
2217
+
2218
+
2219
+
2220
+ #============================================================================
2221
+ #============================================================================
1934
2222
 
1935
2223
  # NO IDEA WHAT THIS DOES. DO NOT USE: THIS METHOD HAS NOT BEEN CHECKED
1936
2224
  # ===== Parameters
1937
2225
  # ++::
1938
2226
  # ===== Returns
1939
2227
  # ...
1940
- def compute_relations_to_items(external_item_list, mode, thresold)
2228
# Entry point of the item/term relation tests.
# Builds the level -> terms table from the loaded items, completes it with
# connect_familiars!, obtains the item list from get_item_list_parental and
# delegates on the method selected by +mode+.
# ===== Parameters
# +external_item_list+:: forwarded to the selected method
# +total_items+:: forwarded to the selected method
# +mode+:: :elim or :weight
# +thresold+:: forwarded only to the :elim method
# ===== Returns
# Result of the selected method or an empty array for any other mode
def compute_relations_to_items(external_item_list, total_items, mode, thresold)
  terms_levels = list_terms_per_level_from_items
  connect_familiars!(terms_levels) # Modifies terms_levels in place
  parental_item_list = get_item_list_parental(terms_levels)
  case mode
  when :elim
    compute_relations_elim(terms_levels, external_item_list, total_items, thresold, parental_item_list)
  when :weight
    compute_relations_weight(terms_levels, external_item_list, total_items, parental_item_list)
  else
    []
  end
end
2242
+
2243
# Build the term -> items table where each term also receives the items of the
# terms placed below it, transferring items level by level from the deepest one.
# ===== Parameters
# +terms_levels+:: hash with Key = level; Value = terms placed in that level
# ===== Returns
# Hash with Key = term; Value = own items plus the items transferred to it
def get_item_list_parental(terms_levels)
  transfered_list = {}
  parent_dict = @dicts[:is_a][:byTerm]
  levels = terms_levels.keys.sort
  return transfered_list if levels.empty? # Nothing to transfer (avoids terms_levels[nil])
  while levels.length > 1
    level = levels.pop # Deepest pending level
    parental_level_terms = terms_levels[level - 1] || []
    terms_levels[level].each do |term|
      parents = parent_dict[term]
      next if parents.nil? || parents.empty?
      # Prefer the parent placed in the level just above. The former union (|)
      # always returned parents.first and ignored the level terms.
      parent = (parents & parental_level_terms).first || parents.first
      term_it = @items[term]
      curr_it = transfered_list[term]
      # Keep what sibling terms already transferred to this parent instead of overwriting it
      parent_all_items = merge_groups([transfered_list[parent], term_it, @items[parent], curr_it])
      transfered_list[parent] = parent_all_items if !parent_all_items.empty?
      term_all_items = merge_groups([term_it, curr_it])
      transfered_list[term] = term_all_items if !term_all_items.empty?
    end
  end
  terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
    transfered_list[term] = @items[term] if transfered_list[term].nil?
  end
  return transfered_list
end
2272
+
2273
# Union of several item groups.
# ===== Parameters
# +groups+:: list of arrays; nil entries are ignored
# ===== Returns
# Array with the elements of all groups without duplicates (empty if there are no groups)
def merge_groups(groups)
  groups.compact.reduce([], :|)
end
2276
+
2277
# Group the terms that have items loaded by their ontology level.
# ===== Returns
# Hash with Key = level (as given by get_term_level); Value = terms of @items in that level
def list_terms_per_level_from_items
  @items.keys.group_by { |term| get_term_level(term) }
end
2290
+
2291
# Complete a level -> terms table IN PLACE so every term has its parental term
# (second element of its first shortest path) listed in the level just above.
# Levels that do not exist yet are created on the fly.
# ===== Parameters
# +terms_levels+:: hash with Key = level; Value = terms in that level. It is modified
# ===== Returns
# nil
def connect_familiars!(terms_levels)
  pending_levels = terms_levels.keys.sort
  until pending_levels.length <= 1 # The last remaining level has no parental level to fill
    current_level = pending_levels.pop
    upper_level = current_level - 1
    upper_terms = terms_levels[upper_level]
    if upper_terms.nil?
      # Gap between levels: create the missing one and queue it so it is processed too
      upper_terms = terms_levels[upper_level] = []
      pending_levels.push(upper_level)
    end
    terms_levels[current_level].each do |term|
      info = @term_paths[term]
      shortest = info[:paths].find { |route| route.length == info[:shortest_path] }
      direct_parent = shortest[1] # Position 0 is the term itself
      upper_terms << direct_parent unless upper_terms.include?(direct_parent)
    end
  end
end
2311
+
2312
+ def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
1941
2313
  results = []
1942
2314
  penalized_terms = {}
1943
- # terms_levels = get_terms_levels(@items_relations.keys)
1944
- terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1945
- terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1946
- terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1947
2315
  levels = terms_levels.keys.sort
1948
2316
  levels.reverse_each do |level|
1949
2317
  terms_levels[level].each do |term|
1950
- associated_items = @items_relations[term]
1951
- if mode == :elim
1952
- items_to_remove = penalized_terms[term]
1953
- items_to_remove = [] if items_to_remove.nil?
1954
- pval = get_fisher_exact_test(
1955
- external_item_list - items_to_remove,
1956
- associated_items - items_to_remove,
1957
- ((associated_items | external_item_list) - items_to_remove).length
1958
- )
1959
- if pval <= thresold
1960
- parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1961
- parents.each do |prnt|
1962
- query = penalized_terms[prnt]
1963
- if query.nil?
1964
- penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1965
- else
1966
- query.concat(@items_relations[term])
1967
- end
2318
+ associated_items = item_list[term]
2319
+ items_to_remove = penalized_terms[term]
2320
+ items_to_remove = [] if items_to_remove.nil?
2321
+ pval = get_fisher_exact_test(
2322
+ external_item_list - items_to_remove,
2323
+ associated_items - items_to_remove,
2324
+ #((associated_items | external_item_list) - items_to_remove).length
2325
+ total_items
2326
+ )
2327
+ if pval <= thresold
2328
+ parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
2329
+ parents.each do |prnt|
2330
+ query = penalized_terms[prnt]
2331
+ if query.nil?
2332
+ penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
2333
+ else
2334
+ query.concat(item_list[term])
1968
2335
  end
1969
2336
  end
1970
2337
  end
@@ -1974,6 +2341,81 @@ class Ontology
1974
2341
  return results
1975
2342
  end
1976
2343
 
2344
# Run computeTermSig over all terms, from the deepest level to the shallowest one.
# ===== Parameters
# +terms_levels+:: hash with Key = level; Value = terms in that level
# +external_item_list+:: forwarded to computeTermSig
# +total_items+:: forwarded to computeTermSig
# +item_list+:: hash with Key = term; Value = items associated to the term
# ===== Returns
# Array of [term, value] pairs built from the hash filled by computeTermSig
def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
  pvals = {}
  # The default block does NOT store the hash it builds: asking for an unknown term
  # returns a throwaway hash. https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
  item_weigths_per_term = Hash.new { |_hash, _key| Hash.new(1) }
  terms_levels.keys.sort.reverse_each do |level|
    terms_levels[level].each do |term|
      own_items = item_list[term]
      # Register the observed items of the term in the weighted list
      add_items_to_weigthed_list(term, own_items, item_weigths_per_term) unless own_items.nil?
      direct_children = @dicts[:is_a][:byValue][term]
      # Only children that already have weighted items are taken into account
      weighted_children = direct_children.nil? ? [] : direct_children.reject { |ch| item_weigths_per_term[ch].empty? }
      computeTermSig(term, weighted_children, external_item_list, total_items, pvals, item_weigths_per_term)
    end
  end
  pvals.to_a
end
2364
+
2365
# Set the weight of the given items to 1 for a term and store the result.
# ===== Parameters
# +term+:: term whose weights are updated
# +associated_items+:: items whose weight is set to 1
# +weigthed_list+:: hash with Key = term; Value = hash item -> weight. It is modified
# ===== Returns
# The item -> weight hash stored for the term
def add_items_to_weigthed_list(term, associated_items, weigthed_list)
  # The explicit assignment is kept: weigthed_list[term] may hand out a hash that is not stored yet
  weigthed_list[term] = associated_items.each_with_object(weigthed_list[term]) do |item, weights|
    weights[item] = 1
  end
end
2370
+
2371
# Compute the weighted Fisher value of a term and rebalance item weights against its children.
# ===== Parameters
# +term+:: term to be evaluated
# +children+:: children of the term to be compared with it
# +external_item_list+:: item list passed to the Fisher test
# +total_items+:: total number of items passed to the Fisher test
# +pvals+:: hash with Key = term; Value = Fisher result. It is modified
# +item_weigths_per_term+:: hash with Key = term; Value = hash item -> weight. It is modified
# ===== Returns
# nil when there are no children or after the re-evaluation of CASE 2; +children+ in CASE 1
def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
  associated_items = item_weigths_per_term[term].keys
  pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
    'two_sided', item_weigths_per_term[term], true)
  pvals[term] = pval
  return if children.empty?

  # sigRatio of every child against the term
  rates = children.each_with_object({}) do |child, ratios|
    ratios[child] = sigRatio(pvals[child], pval)
  end

  if rates.each_value.none? { |ratio| ratio >= 1 } # CASE 1
    # No child reaches ratio >= 1: scale child weights by their ratio and re-test each child
    children.each do |child|
      child_ratio = rates[child]
      item_weigths_per_term[child].transform_values! { |weight| weight * child_ratio }
      pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
        'two_sided', item_weigths_per_term[child], true)
    end
  else # CASE 2
    lineage = get_ancestors(term, true)
    lineage << term
    rates.each do |_child, ratio|
      next unless ratio >= 1 # The child is better than parent
      lineage.each do |anc|
        # NOTE(review): if item_weigths_per_term[anc] hands out a default hash that is
        # not stored, the divisions below are lost for that ancestor - confirm intent.
        anc_weights = item_weigths_per_term[anc]
        associated_items.each { |item| anc_weights[item] /= ratio }
      end
    end
    # rates holds one key per child, so the remaining children list is always empty
    # and this call only re-evaluates the term with the updated weights.
    # NOTE(review): presumably only the children with ratio >= 1 should be removed - confirm.
    computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
  end
end
2412
+
2413
# Ratio between the natural logarithms of two values.
# ===== Parameters
# +pvalA+:: value whose logarithm is the numerator
# +pvalB+:: value whose logarithm is the denominator
# ===== Returns
# log(pvalA) / log(pvalB)
def sigRatio(pvalA, pvalB)
  log_a = Math.log(pvalA)
  log_b = Math.log(pvalB)
  log_a / log_b
end
2416
+
2417
+ #============================================================================
2418
+ #============================================================================
1977
2419
 
1978
2420
  # Check if a given ID is a removable (blacklist) term.
1979
2421
  # +DEPRECATED+ use is_removable? instead
@@ -1,3 +1,3 @@
1
1
  module Semtools
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.6"
3
3
  end
data/semtools.gemspec CHANGED
@@ -34,4 +34,5 @@ Gem::Specification.new do |spec|
34
34
 
35
35
  spec.add_development_dependency "rake"
36
36
  spec.add_development_dependency "rspec"
37
+ spec.add_runtime_dependency 'colorize', '>= 0.7.3'
37
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoanezonjic
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-03-22 00:00:00.000000000 Z
12
+ date: 2021-05-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: text
@@ -53,6 +53,20 @@ dependencies:
53
53
  - - ">="
54
54
  - !ruby/object:Gem::Version
55
55
  version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: colorize
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.7.3
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.7.3
56
70
  description: This gem allows to perform ontology based operations and calculation
57
71
  of Semantic similarity and information coefficient using different implementations.
58
72
  email: