semtools 0.1.3 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e68630d42a4faf01dc15fdfa9f1acd64425ef1396ed6f9ce0a8d76319922ba06
4
- data.tar.gz: 952d908af5370031df0f19c98ab69fbb59b51825f050b69714f4494e15f77f77
3
+ metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
4
+ data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
5
5
  SHA512:
6
- metadata.gz: 85792433d82f824297df87cb0927b24116425ddb2a72a3e2f461748e014aa27f4efc8f73fcd7d1e6c423acd7487b77d21c2a8c0b7b0f8530030f6246ad62ad64
7
- data.tar.gz: 2d0e0953f19d8c2cad2cc85a0c6d8c1cb9bf95f4dd1ee2d75aebcf15bdd3929d2938ede6544ed3f145ac5a8804b97af64f50a859ae7ecf8164f0ed4f07208fb2
6
+ metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
7
+ data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
@@ -1,7 +1,8 @@
1
1
  # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
2
2
  # To compute Fisher's exact test
3
3
  #Fisher => http://www.biostathandbook.com/fishers.html
4
- def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
4
+ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
5
+ #puts '-', listA.inspect, listB.inspect, '-'
5
6
  listA_listB = listA & listB
6
7
  listA_nolistB = listA - listB
7
8
  nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
16
17
  listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
17
18
  listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
18
19
  nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
19
- nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
20
- all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
20
+
21
+ if partial_weigths
22
+ nolistA_nolistB_count = all_elements_count - (listA | listB).length
23
+ all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
24
+ else
25
+ nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
26
+ all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
27
+ end
21
28
  end
29
+ #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
22
30
  if tail == 'two_sided'
23
31
  accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
24
32
  elsif tail == 'less'
@@ -1,4 +1,5 @@
1
1
  require 'json'
2
+ require 'colorize'
2
3
 
3
4
 
4
5
  class Ontology
@@ -38,7 +39,7 @@ class Ontology
38
39
  # => @removable_terms :: array of terms to not be considered
39
40
  # => @term_paths :: metainfo about parental paths of each term
40
41
 
41
- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:alt_id,:replaced_by,:consider]}
42
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
42
43
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
43
44
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
44
45
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@ class Ontology
52
53
  # Instantiate a OBO_Handler object
53
54
  # ===== Parameters
54
55
  # +file+:: with info to be loaded (.obo ; .json)
55
- # +load_file+:: activate load process automatically (only for .obo)
56
+ # +load_file+:: activate load process automatically
56
57
  # +removable_terms+: term to be removed from calcs
57
58
  # +build+: flag to launch metainfo calculation
58
- def initialize(file: nil, load_file: false, removable_terms: [], build: true)
59
+ # +file_format+: force format type despite file extension. Can be :obo or :json
60
+ def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
59
61
  # Initialize object variables
60
62
  @header = nil
61
63
  @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@ class Ontology
74
76
  @items = {}
75
77
  @removable_terms = []
76
78
  @term_paths = {}
77
- # Load if proceeds
78
79
  add_removable_terms(removable_terms) if !removable_terms.empty?
79
- load(file, build: build) if load_file
80
+ load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
81
+ # Load if proceeds
82
+ if load_file
83
+ fformat = file_format
84
+ fformat = File.extname(file) if fformat.nil? && !file.nil?
85
+ if fformat == :obo || fformat == ".obo"
86
+ load(file, build: build)
87
+ elsif fformat == :json || fformat == ".json"
88
+ self.read(file, build: build)
89
+ elsif !fformat.nil?
90
+ warn 'Format not allowed. Loading process will not be performed'
91
+ end
92
+ end
80
93
  end
81
94
 
82
95
 
@@ -413,31 +426,54 @@ class Ontology
413
426
  # +bidirectional+:: calculate bidirectional similitude. Default: false
414
427
  # ===== Return
415
428
  # similitude calculated
416
- def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
429
+ def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
417
430
  # Check
418
431
  raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
419
432
  raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
420
433
  micasA = []
421
434
  # Compare A -> B
422
435
  termsA.each do |tA|
423
- micas = termsB.map{|tB| self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)}
424
- # Remove special cases
425
- [false,nil].each do |err_value| micas.delete(err_value) end
426
- # Obtain maximum value
427
- micasA << micas.max if micas.length > 0
428
- micasA << 0 if micas.length <= 0
429
- end
430
- means_sim = micasA.inject{ |sum, el| sum + el }.to_f / micasA.size
436
+ micas = []
437
+ termsB.each do |tB|
438
+ if store_mica
439
+ value = @mica_index.dig(tA, tB)
440
+ else
441
+ value = nil
442
+ end
443
+ if value.nil?
444
+ value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
445
+ if store_mica
446
+ value = true if value.nil? # We store true to record that the comparison was already made but yielded no MICA value
447
+ add2nestHash(@mica_index, tA, tB, value)
448
+ end
449
+ end
450
+ micas << value if value.class == Float
451
+ end
452
+ if !micas.empty?
453
+ micasA << micas.max # Obtain maximum value
454
+ else
455
+ micasA << 0
456
+ end
457
+ end
458
+ means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
431
459
  # Compare B -> A
432
460
  if bidirectional
433
461
  means_simA = means_sim * micasA.size
434
- means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
435
- means_sim = (means_simA + means_simB) / (termsA.size + termsB.size)
462
+ means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
463
+ means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
436
464
  end
437
465
  # Return
438
466
  return means_sim
439
467
  end
440
468
 
469
+ def add2nestHash(h, key1, key2, val)
470
+ query1 = h[key1]
471
+ if query1.nil?
472
+ h[key1] = {key2 => val}
473
+ else
474
+ query1[key2] = val
475
+ end
476
+ end
441
477
 
442
478
  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
443
479
  # ===== Parameters
@@ -462,12 +498,13 @@ class Ontology
462
498
  main_profiles = @profiles
463
499
  end
464
500
  # Compare
501
+ @mica_index = {}
465
502
  while !main_ids.empty?
466
503
  curr_id = main_ids.shift
467
504
  current_profile = main_profiles[curr_id]
468
505
  comp_ids.each do |id|
469
506
  profile = comp_profiles[id]
470
- value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
507
+ value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
471
508
  query = profiles_similarity[curr_id]
472
509
  if query.nil?
473
510
  profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@ class Ontology
485
522
  # +alt_tag+:: tag used to expand alternative IDs
486
523
  # ===== Returns
487
524
  # true if process ends without errors and false in other cases
488
- def get_index_alternatives(alt_tag: @@basic_tags[:alternative][0])
525
+ def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
489
526
  # Check input
490
527
  raise('stanzas terms empty') if @stanzas[:terms].empty?
491
528
  # Take all alternative IDs
492
529
  alt_ids2add = {}
493
530
  @stanzas[:terms].each do |id, tags|
494
- alt_ids = tags[alt_tag]
495
- if !alt_ids.nil?
496
- alt_ids = alt_ids - @removable_terms
497
- # Update info
498
- alt_ids.each do |alt_term|
499
- @alternatives_index[alt_term] = id
500
- alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
501
- @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
531
+ if id == tags[:id] # Avoid simulated alternative terms
532
+ # id = tags[:id] # Take always real ID in case of alternative terms simulted
533
+ alt_ids = tags[alt_tag]
534
+ if !alt_ids.nil?
535
+ alt_ids = alt_ids - @removable_terms - [id]
536
+ # Update info
537
+ alt_ids.each do |alt_term|
538
+ @alternatives_index[alt_term] = id
539
+ alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
540
+ @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
541
+ end
502
542
  end
503
543
  end
504
544
  end
@@ -510,10 +550,11 @@ class Ontology
510
550
  # ===== Returns
511
551
  # true if the process ends without errors and false in other cases
512
552
  def build_index()
513
- self.get_index_alternatives
514
553
  self.get_index_obsoletes
554
+ self.get_index_alternatives
515
555
  self.get_index_child_parent_relations
516
556
  @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
557
+ ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
517
558
  @alternatives_index.compact!
518
559
  @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
519
560
  @obsoletes_index.compact!
@@ -536,8 +577,6 @@ class Ontology
536
577
  if @ancestors_index.empty?
537
578
  warn('ancestors_index object is empty')
538
579
  else
539
- # Prepare useful variables
540
- alternative_terms = @alternatives_index.keys
541
580
  # Per each term, add frequencies
542
581
  @stanzas[:terms].each do |id, tags|
543
582
  if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@ class Ontology
556
595
  @meta[id] = query
557
596
  end
558
597
  # Store metadata
559
- query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !alternative_terms.include?(anc)}.to_f : 0.0
560
- query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !alternative_terms.include?(desc)}.to_f : 0.0
598
+ query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
599
+ query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
561
600
  query[:struct_freq] = query[:descendants] + 1.0
562
601
  # Update maximums
563
602
  @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@ class Ontology
582
621
  # Check obsoletes
583
622
  @stanzas[:terms].each do |id, term_tags|
584
623
  next if term_tags.nil?
624
+ next if self.is_alternative?(id)
585
625
  query = term_tags[obs_tag]
586
626
  if !query.nil? && query == 'true' # Obsolete tag presence
587
627
  next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@ class Ontology
633
673
  end
634
674
  end
635
675
  # Store alternatives
636
- @alternatives_index.each do |id,alt|
637
- anc[id] = anc[alt] if anc.include?(alt)
638
- des[id] = des[alt] if des.include?(alt)
639
- end
676
+ # @alternatives_index.each do |id,alt|
677
+ # anc[id] = anc[alt] if anc.include?(alt)
678
+ # des[id] = des[alt] if des.include?(alt)
679
+ # end
640
680
  # Check structure
641
681
  if ![:atomic,:sparse].include? structType
642
682
  structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@ class Ontology
704
744
  # the IC calculated
705
745
  def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
706
746
  term = termRaw.to_sym
747
+ curr_ics = @ics[type]
707
748
  # Check
708
749
  raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
709
750
  # Check if it's already calculated
710
- return @ics[type][term] if (@ics[type].include? term) && !force
751
+ return curr_ics[term] if (curr_ics.include? term) && !force
711
752
  # Calculate
712
753
  ic = - 1
754
+ term_meta = @meta[term]
713
755
  case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
714
756
  ###########################################
715
757
  #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@ class Ontology
726
768
  ###########################################
727
769
  when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
728
770
  # -log(Freq(x) / Max_Freq)
729
- ic = -Math.log10(@meta[term][:struct_freq].fdiv(@max_freqs[:struct_freq]))
771
+ ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
730
772
  when :resnik_observed
731
773
  # -log(Freq(x) / Max_Freq)
732
- ic = -Math.log10(@meta[term][:observed_freq].fdiv(@max_freqs[:observed_freq]))
774
+ ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
733
775
  # Lin
734
776
  # Jiang & Conrath
735
777
 
@@ -745,17 +787,17 @@ class Ontology
745
787
  ###########################################
746
788
  when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
747
789
  # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
748
- ic = 1 - Math.log10(@meta[term][:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
790
+ ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
749
791
  if :zhou # New Model of Semantic Similarity Measuring in Wordnet
750
792
  # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
751
793
  @ics[:seco][term] = ic # Special store
752
- ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(@meta[term][:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
794
+ ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
753
795
  end
754
796
  when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
755
- ic = -Math.log10((@meta[term][:descendants].fdiv(@meta[term][:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
797
+ ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
756
798
  # Knappe
757
799
  end
758
- @ics[type][term] = ic
800
+ curr_ics[term] = ic
759
801
  return ic
760
802
  end
761
803
 
@@ -788,8 +830,8 @@ class Ontology
788
830
  # ===== Returns
789
831
  # the IC of the MICA(termA,termB)
790
832
  def get_ICMICA(termA, termB, ic_type = :resnik)
791
- mica = self.get_MICA(termA, termB, ic_type)
792
- return mica.first.nil? ? nil : mica.last
833
+ term, ic = self.get_MICA(termA, termB, ic_type)
834
+ return term.nil? ? nil : ic
793
835
  end
794
836
 
795
837
 
@@ -812,19 +854,12 @@ class Ontology
812
854
  # Obtain ancestors (each term itself is included too)
813
855
  anc_A = self.get_ancestors(termA)
814
856
  anc_B = self.get_ancestors(termB)
815
-
816
857
  if !(anc_A.empty? && anc_B.empty?)
817
858
  anc_A << termA
818
859
  anc_B << termB
819
- # Find shared ancestors
820
- shared_ancestors = anc_A & anc_B
821
- # Find MICA
822
- if shared_ancestors.length > 0
823
- shared_ancestors.each do |anc|
824
- ic = self.get_IC(anc, type: ic_type)
825
- # Check
826
- mica = [anc,ic] if ic > mica[1]
827
- end
860
+ (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
861
+ ic = self.get_IC(anc, type: ic_type)
862
+ mica = [anc,ic] if ic > mica[1]
828
863
  end
829
864
  end
830
865
  end
@@ -844,9 +879,8 @@ class Ontology
844
879
  # Check
845
880
  raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
846
881
  sim = nil
847
- # Launch comparissons
848
- sim_res = get_ICMICA(termA, termB, ic_type)
849
- if !sim_res.nil?
882
+ mica, sim_res = get_MICA(termA, termB, ic_type)
883
+ if !mica.nil?
850
884
  case type
851
885
  when :resnik
852
886
  sim = sim_res
@@ -922,6 +956,16 @@ class Ontology
922
956
  jsonFile = File.open(file)
923
957
  jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
924
958
  # Pre-process (Symbolize some hashs values)
959
+ if !jsonInfo[:header].nil?
960
+ aux = jsonInfo[:header].map do |entry,info|
961
+ if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
962
+ [entry,info.map{|item| item.to_sym}]
963
+ else
964
+ [entry,info]
965
+ end
966
+ end
967
+ jsonInfo[:header] = aux.to_h
968
+ end
925
969
  jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
926
970
  jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
927
971
  jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@ class Ontology
1106
1150
  if checked.nil?
1107
1151
  t
1108
1152
  else
1109
- byValue[checked] = byValue.delete(t) if checked != t && !byValue.keys.include?(checked) # Update in byValue
1153
+ byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
1110
1154
  checked
1111
1155
  end
1112
1156
  end
@@ -1134,7 +1178,8 @@ class Ontology
1134
1178
  else
1135
1179
  aux = self.extract_id(referenceValue)
1136
1180
  end
1137
- referenceValue = aux if !aux.nil?
1181
+ aux.compact! unless aux.nil?
1182
+ referenceValue = aux unless aux.nil?
1138
1183
  end
1139
1184
  referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
1140
1185
  byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@ class Ontology
1525
1570
  # ===== Returns
1526
1571
  # cleaned profile
1527
1572
  def clean_profile(profile, remove_alternatives: true)
1573
+ warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
1528
1574
  terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
1529
1575
  if remove_alternatives
1530
1576
  terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@ class Ontology
1534
1580
  return terms_without_ancestors_and_alternatices
1535
1581
  end
1536
1582
 
1583
+ def clean_profile_hard(profile)
1584
+ profile, _ = check_ids(profile)
1585
+ profile = profile.select{|t| !is_obsolete?(t)}
1586
+ profile = clean_profile(profile.uniq)
1587
+ return profile
1588
+ end
1589
+
1590
+ # Remove terms from a given profile using hierarchical info and scores set given
1591
+ # ===== Parameters
1592
+ # +profile+:: profile to be cleaned
1593
+ # +scores+:: hash with terms by keys and numerical values (scores)
1594
+ # +byMax+:: if true, the maximum scored term will be kept; if false, the minimum will be kept
1595
+ # +remove_without_score+:: if true, terms without score will be removed. Default: true
1596
+ # ===== Returns
1597
+ # cleaned profile
1598
+ def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
1599
+ scores = scores.sort_by{|term,score| score}.to_h
1600
+ keep = profile.map do |term|
1601
+ if scores.include?(term)
1602
+ parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
1603
+ targetable = parentals.select{|parent| profile.include?(parent)}
1604
+ if targetable.empty?
1605
+ term
1606
+ else
1607
+ targetable << term
1608
+ targets = scores.select{|term,score| targetable.include?(term)}.to_h
1609
+ byMax ? targets.keys.last : targets.keys.first
1610
+ end
1611
+ elsif remove_without_score
1612
+ nil
1613
+ else
1614
+ term
1615
+ end
1616
+ end
1617
+ return keep.compact.uniq
1618
+ end
1619
+
1537
1620
 
1538
1621
  # Remove alternatives (if official term is present) and ancestors terms of stored profiles
1539
1622
  # ===== Parameters
@@ -1635,44 +1718,45 @@ class Ontology
1635
1718
 
1636
1719
  # Find paths of a term following its ancestors and store all possible paths for it and its parentals.
1637
1720
  # Also calculates paths metadata and stores into @term_paths
1638
- def calc_term_paths
1639
- self.calc_ancestors_dictionary if !@dicts.keys.include?(:is_a) # Calculate direct parentals dictionary if it's not already calculated
1640
- visited_terms = []
1721
+ def calc_term_paths(only_main_terms=false)
1722
+ self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
1723
+ visited_terms = {} # PEDRO: To keep track of visited data, hash accesses are faster than array includes. I don't understand why this variable is used instead of checking @term_paths to see if the data is calculated
1641
1724
  @term_paths = {}
1642
1725
  if [:hierarchical, :sparse].include? @structureType
1643
- terms = @stanzas[:terms].keys
1644
- terms.each do |term|
1645
- if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
1726
+ @stanzas[:terms].each do |term, t_attributes|
1727
+ if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
1646
1728
  special_term = term
1647
1729
  term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
1648
1730
  @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1649
1731
  @term_paths[special_term] = @term_paths[term]
1650
- visited_terms << special_term
1732
+ visited_terms[special_term] = true
1651
1733
  end
1652
-
1653
1734
  if !visited_terms.include?(term)
1654
- @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
1735
+ # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
1736
+ path_attr = @term_paths[term]
1737
+ if path_attr.nil?
1738
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
1739
+ @term_paths[term] = path_attr #save path data container
1740
+ end
1655
1741
  parentals = @dicts[:is_a][:byTerm][term]
1656
1742
  if parentals.nil?
1657
- @term_paths[term][:paths] << [term]
1743
+ path_attr[:paths] << [term]
1658
1744
  else
1659
1745
  parentals.each do |direct_parental|
1660
- if visited_terms.include? direct_parental # Use direct_parental already calculated paths
1661
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1662
- else # Calculate new paths
1663
- self.expand_path(direct_parental, visited_terms)
1664
- new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
1665
- end
1666
- new_paths.each{|path| @term_paths[term][:paths] << path}
1746
+ self.expand_path(direct_parental)
1747
+ new_paths = @term_paths[direct_parental][:paths]
1748
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
1667
1749
  end
1668
- end
1669
- visited_terms << term
1750
+ end
1751
+ anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
1752
+ visited_terms[term] = true
1670
1753
  end
1671
1754
  # Update metadata
1672
- @term_paths[term][:total_paths] = @term_paths[term][:paths].length
1673
- paths_sizes = @term_paths[term][:paths].map{|path| path.length}
1674
- @term_paths[term][:largest_path] = paths_sizes.max
1675
- @term_paths[term][:shortest_path] = paths_sizes.min
1755
+ path_attr = @term_paths[term]
1756
+ path_attr[:total_paths] = path_attr[:paths].length
1757
+ paths_sizes = path_attr[:paths].map{|path| path.length}
1758
+ path_attr[:largest_path] = paths_sizes.max
1759
+ path_attr[:shortest_path] = paths_sizes.min
1676
1760
  end
1677
1761
  else
1678
1762
  warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
@@ -1684,20 +1768,25 @@ class Ontology
1684
1768
  # ===== Parameters
1685
1769
  # +curr_term+:: current visited term
1686
1770
  # +visited_terms+:: already expanded terms
1687
- def expand_path(curr_term, visited_terms)
1688
- if !visited_terms.include?(curr_term) # Not already expanded
1689
- @term_paths[curr_term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if @term_paths[curr_term].nil?
1771
+ def expand_path(curr_term)
1772
+ if !@term_paths.include?(curr_term)
1773
+ path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
1774
+ @term_paths[curr_term] = path_attr
1690
1775
  direct_parentals = @dicts[:is_a][:byTerm][curr_term]
1691
1776
  if direct_parentals.nil? # No parents :: End of recurrence
1692
- @term_paths[curr_term][:paths] << [curr_term]
1777
+ path_attr[:paths] << [curr_term]
1693
1778
  else # Expand and concat
1694
1779
  direct_parentals.each do |ancestor|
1695
- self.expand_path(ancestor,visited_terms) if !visited_terms.include?(ancestor)
1696
- new_paths = @term_paths[ancestor][:paths].map{|path| [curr_term, path].flatten}
1697
- new_paths.each{|path| @term_paths[curr_term][:paths] << path}
1780
+ path_attr_parental = @term_paths[ancestor]
1781
+ if path_attr_parental.nil? # Calculate new paths
1782
+ self.expand_path(ancestor)
1783
+ new_paths = @term_paths[ancestor][:paths]
1784
+ else # Use direct_parental paths already calculated
1785
+ new_paths = path_attr_parental[:paths]
1786
+ end
1787
+ path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
1698
1788
  end
1699
1789
  end
1700
- visited_terms << curr_term
1701
1790
  end
1702
1791
  end
1703
1792
 
@@ -1717,6 +1806,26 @@ class Ontology
1717
1806
  return @dicts[:level][:byValue][term]
1718
1807
  end
1719
1808
 
1809
+ # nil, term not found, [] term exists but not has parents
1810
+ def get_parental_path(term, which_path = :shortest_path, level = 0)
1811
+ path = nil
1812
+ path_attr = @term_paths[term]
1813
+ if !path_attr.nil?
1814
+ path_length = path_attr[which_path]
1815
+ all_paths = path_attr[:paths]
1816
+ if all_paths.empty?
1817
+ path = []
1818
+ else
1819
+ path = all_paths.select{|pt| pt.length == path_length}.first.clone
1820
+ if level > 0 # we want the term and his ascendants until a specific level
1821
+ n_parents = path_length - level
1822
+ path = path[0..n_parents]
1823
+ end
1824
+ path.shift # Discard the term itself
1825
+ end
1826
+ end
1827
+ return path
1828
+ end
1720
1829
 
1721
1830
  # Return ontology levels from profile terms
1722
1831
  # ===== Returns
@@ -1737,6 +1846,83 @@ class Ontology
1737
1846
  return levels_filtered
1738
1847
  end
1739
1848
 
1849
+ def get_profile_ontology_distribution_tables
1850
+ cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
1851
+ uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
1852
+ hpo_ontology_levels = get_ontology_levels
1853
+ total_ontology_terms = hpo_ontology_levels.values.flatten.length
1854
+ total_cohort_terms = cohort_ontology_levels.values.flatten.length
1855
+ total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
1856
+
1857
+ ontology_levels = []
1858
+ distribution_percentage = []
1859
+ hpo_ontology_levels.each do |level, terms|
1860
+ cohort_terms = cohort_ontology_levels[level]
1861
+ uniq_cohort_terms = uniq_cohort_ontology_levels[level]
1862
+ if cohort_terms.nil? || uniq_cohort_terms.nil?
1863
+ num = 0
1864
+ u_num = 0
1865
+ else
1866
+ num = cohort_terms.length
1867
+ u_num = uniq_cohort_terms.length
1868
+ end
1869
+ ontology_levels << [level, terms.length, num]
1870
+ distribution_percentage << [
1871
+ level,
1872
+ (terms.length.fdiv(total_ontology_terms)*100).round(3),
1873
+ (num.fdiv(total_cohort_terms)*100).round(3),
1874
+ (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
1875
+ ]
1876
+ end
1877
+ ontology_levels.sort! { |x,y| x.first <=> y.first }
1878
+ distribution_percentage.sort! { |x,y| x.first <=> y.first }
1879
+ return ontology_levels, distribution_percentage
1880
+ end
1881
+
1882
+ def get_dataset_specifity_index(mode)
1883
+ ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
1884
+ if mode == 'uniq'
1885
+ observed_distribution = 3
1886
+ elsif mode == 'weigthed'
1887
+ observed_distribution = 2
1888
+ end
1889
+ max_terms = distribution_percentage.map{|row| row[1]}.max
1890
+ maxL = nil
1891
+ distribution_percentage.each do |level_info|
1892
+ maxL = level_info.first if level_info[1] == max_terms
1893
+ end
1894
+ diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
1895
+ diffL.select!{|dL| dL.last > 0}
1896
+ lowSection = diffL.select{|dL| dL.first <= maxL}
1897
+ highSection = diffL.select{|dL| dL.first > maxL}
1898
+ dsi = nil
1899
+ if highSection.empty?
1900
+ dsi = 0
1901
+ else
1902
+ accumulated_weigth = 0
1903
+ accumulated_weigthed_diffL = 0
1904
+ hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
1905
+ lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
1906
+ dsi = hss.fdiv(lss)
1907
+ end
1908
+ return dsi
1909
+ end
1910
+
1911
+ def get_weigthed_level_contribution(section, maxL, nLevels)
1912
+ accumulated_weigthed_diffL = 0
1913
+ section.each do |level, diff|
1914
+ weightL = maxL - level
1915
+ if weightL >= 0
1916
+ weightL += 1
1917
+ else
1918
+ weightL = weightL.abs
1919
+ end
1920
+ accumulated_weigthed_diffL += diff * weightL
1921
+ end
1922
+ weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
1923
+ return weigthed_contribution
1924
+ end
1925
+
1740
1926
 
1741
1927
  # Calculate profiles dictionary with Key= Term; Value = Profiles
1742
1928
  def calc_profiles_dictionary
@@ -1808,17 +1994,66 @@ class Ontology
1808
1994
  end
1809
1995
  end
1810
1996
  if expand
1811
- relations.each do |k,v|
1812
- if @items.keys.include?(k)
1813
- @items[k] = (@items[k] + v).uniq
1814
- else
1815
- @items[k] = v
1816
- end
1817
- end
1997
+ @items = self.concatItems(@items,relations)
1998
+ # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
1999
+ # if @items.keys.include?(k)
2000
+ # if v.kind_of?(Array)
2001
+ # @items[k] = (@items[k] + v).uniq
2002
+ # elsif v.kind_of?(Hash)
2003
+ # @items.merge!(relations) do |k, oldV, newV|
2004
+ # if oldV.kind_of?(Array)
2005
+ # return (oldV + newV).uniq
2006
+ # else
2007
+ # oldV = [oldV,newV]
2008
+ # end
2009
+ # end
2010
+ # elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
2011
+ # @items[k] = (@items[k] + [v]).uniq
2012
+ # else
2013
+ # @items[k] = [@items[k],v]
2014
+ # end
2015
+ # else
2016
+ # @items[k] = v
2017
+ # end
2018
+ # end
1818
2019
  else
1819
2020
  @items.merge!(relations)
1820
2021
  end
1821
- end
2022
+ end
2023
+
2024
+ # Internal function to concat two elements.
2025
+ # ===== Parameters
2026
+ # +itemA+:: item to be concatenated
2027
+ # +itemB+:: item to be concatenated
2028
+ # ===== Returns
2029
+ # Concatenated objects
2030
def concatItems(itemA,itemB)
  # Result by argument types:
  #   Array + Array     => elements of both, duplicates removed
  #   Hash  + Hash      => merged Hash; values of shared keys are concatenated recursively
  #   other + Array     => [itemA] followed by the elements of itemB, duplicates removed
  #   other + single    => [itemA, itemB], duplicates removed
  #   other + Hash      => nil (combination not supported)
  # NOTE(review): Array + single, Hash + Array and Hash + single are described as not
  # allowed in the design notes, but they are not rejected: itemA is handled as a single element.
  if itemA.kind_of?(Array) && itemB.kind_of?(Array)
    (itemA + itemB).uniq
  elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
    itemA.merge(itemB) { |_key, old_value, new_value| concatItems(old_value, new_value) }
  elsif itemB.kind_of?(Array)
    ([itemA] + itemB).uniq
  elsif !itemB.kind_of?(Hash) # kind_of? (not an exact class match) so Hash subclasses are treated as hashes too
    [itemA, itemB].uniq
  end
end
1822
2057
 
1823
2058
 
1824
2059
  # Assign a dictionary already calculated as a items set.
@@ -1826,7 +2061,7 @@ class Ontology
1826
2061
  # +dictID+:: dictionary ID to be stored (:byTerm will be used)
1827
2062
  def set_items_from_dict(dictID, remove_old_relations = false)
1828
2063
  @items = {} if remove_old_relations
1829
- if(@dicts.keys.include?(dictID))
2064
+ if !@dicts[dictID].nil?
1830
2065
  @items.merge(@dicts[dictID][:byTerm])
1831
2066
  else
1832
2067
  warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@ class Ontology
1875
2110
  curr_keys.map do |term_expand|
1876
2111
  to_infer = []
1877
2112
  # Obtain childs
1878
- childs = self.get_descendants(term_expand,true).select{|t| @items.keys.include?(t)}
2113
+ childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
1879
2114
  # Expand
1880
2115
  if childs.length > 0 && minimum_childs == 1 # Special case
1881
2116
  to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@ class Ontology
1931
2166
  end
1932
2167
 
1933
2168
 
2169
+ # Return direct ancestors/descendants of a given term
2170
+ # ===== Parameters
2171
+ # +term+:: term whose direct ancestors/descendants are requested
2172
+ # +relation+:: can be :ancestor or :descendant
2173
+ # +remove_alternatives+:: if true, alternatives will be removed
2174
+ # ===== Returns
2175
+ # Direct ancestors/descendants of given term or nil if any error occurs
2176
def get_direct_related(term, relation, remove_alternatives: false)
  hierarchy = @dicts[:is_a]
  if hierarchy.nil?
    warn("Hierarchy dictionary is not already calculated. Returning nil")
    return nil
  end
  # Ancestors are looked up in the :byTerm side of the dictionary, descendants in the :byValue side
  direction = { ancestor: :byTerm, descendant: :byValue }[relation]
  if direction.nil?
    warn('Relation type not allowed. Returning nil')
    return nil
  end
  related = hierarchy[direction][term]
  return related if related.nil? || !remove_alternatives
  # Only the first element returned by the helper (the cleaned list) is kept
  filtered, _ = remove_alternatives_from_profile(related)
  filtered
end
2196
+
2197
+
2198
+ # Return direct ancestors of a given term
2199
+ # ===== Parameters
2200
+ # +term+:: which ancestors are requested
2201
+ # +remove_alternatives+:: if true, alternatives will be removed
2202
+ # ===== Returns
2203
+ # Direct ancestors of given term or nil if any error occurs
2204
def get_direct_ancentors(term, remove_alternatives: false)
  # Thin wrapper over get_direct_related for the :ancestor relation.
  # NOTE: the method name is misspelled ("ancentors"); it is kept so existing callers keep working.
  get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
end
2207
+
2208
+ # Return direct descendants of a given term
2209
+ # ===== Parameters
2210
+ # +term+:: which descendants are requested
2211
+ # +remove_alternatives+:: if true, alternatives will be removed
2212
+ # ===== Returns
2213
+ # Direct descendants of given term or nil if any error occurs
2214
def get_direct_descendants(term, remove_alternatives: false)
  # Thin wrapper over get_direct_related for the :descendant relation
  get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
end
2217
+
2218
+
2219
+
2220
+ #============================================================================
2221
+ #============================================================================
1934
2222
 
1935
2223
  # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
1936
2224
  # ===== Parameters
1937
2225
  # ++::
1938
2226
  # ===== Returns
1939
2227
  # ...
1940
- def compute_relations_to_items(external_item_list, mode, thresold)
2228
# Build the per-level term table from the stored items, complete it with the parental
# terms, propagate the items towards the parents and run the requested analysis.
# ===== Parameters
# +external_item_list+:: item list handed to the selected analysis
# +total_items+:: total number of items, handed to the selected analysis
# +mode+:: :elim or :weight
# +thresold+:: threshold handed to the :elim analysis (not used by :weight)
# ===== Returns
# Whatever the selected analysis returns, or an empty array for any other mode
def compute_relations_to_items(external_item_list, total_items, mode, thresold)
  levels_with_terms = list_terms_per_level_from_items
  connect_familiars!(levels_with_terms) # completes the table in place
  inherited_items = get_item_list_parental(levels_with_terms)
  case mode
  when :elim
    compute_relations_elim(levels_with_terms, external_item_list, total_items, thresold, inherited_items)
  when :weight
    compute_relations_weight(levels_with_terms, external_item_list, total_items, inherited_items)
  else
    []
  end
end
2242
+
2243
# Propagate the items of every term to its parent, walking the levels from the deepest
# one up to (but not including) the shallowest one.
# ===== Parameters
# +terms_levels+:: hash level => terms of that level. Presumably already completed with the
#                  parental levels (see connect_familiars!) -- TODO confirm against callers
# ===== Returns
# Hash term => items of the term plus the items received from the child terms processed.
# Terms left without any item are not included, except the ones of the shallowest level
def get_item_list_parental(terms_levels)
  transfered_list = {}
  return transfered_list if terms_levels.empty? # nothing to propagate

  parent_dict = @dicts[:is_a][:byTerm]
  levels = terms_levels.keys.sort
  while levels.length > 1
    level = levels.pop
    upper_level_terms = terms_levels[level - 1] || []
    terms_levels[level].each do |term|
      parents = parent_dict[term]
      next if parents.nil?

      # With several parents, prefer one located in the level right above (intersection,
      # a union would always yield the first declared parent); otherwise use the first declared one
      parent = parents.length == 1 ? parents.first : ((parents & upper_level_terms).first || parents.first)
      own_items = @items[term]
      received_items = transfered_list[term]
      unless parent.nil?
        # Keep what the parent already received from sibling terms instead of overwriting it
        parent_all_items = merge_groups([transfered_list[parent], own_items, @items[parent], received_items])
        transfered_list[parent] = parent_all_items unless parent_all_items.empty?
      end
      term_all_items = merge_groups([own_items, received_items])
      transfered_list[term] = term_all_items unless term_all_items.empty?
    end
  end
  # Terms of the shallowest level are never processed above, so the ones that did not
  # receive items from children keep their own items
  terms_levels[levels.first].each do |term|
    transfered_list[term] = @items[term] if transfered_list[term].nil?
  end
  transfered_list
end
2272
+
2273
# Union of several item groups.
# ===== Parameters
# +groups+:: list of arrays; nil entries are ignored
# ===== Returns
# New array with the elements of all the groups, without duplicates
def merge_groups(groups)
  groups.compact.reduce([], :|)
end
2276
+
2277
# Group the terms that have items assigned (keys of @items) by their level.
# ===== Returns
# Hash level => list of terms located at that level, as reported by get_term_level
def list_terms_per_level_from_items
  @items.keys.group_by { |term| get_term_level(term) }
end
2290
+
2291
# Complete, in place, the level table with the parental term of every listed term.
#
# Levels are processed from the deepest one upwards. For each term the parental is the
# second element of its first shortest path (the first element is the term itself), and
# it is added to the level right above. A missing level right above is created and
# queued, so it is processed as well.
# ===== Parameters
# +terms_levels+:: hash level => terms of that level. It is modified by this method
# ===== Returns
# nil
def connect_familiars!(terms_levels)
  pending_levels = terms_levels.keys.sort
  while pending_levels.length > 1 # Process when current level has a parental level
    level = pending_levels.pop
    parental_level = level - 1
    parental_terms = terms_levels[parental_level]
    if parental_terms.nil? # The level right above does not exist, but there are shallower levels
      parental_terms = terms_levels[parental_level] = []
      # parental_level is greater than every pending level, so the list stays sorted
      pending_levels << parental_level
    end
    terms_levels[level].each do |term|
      path_info = @term_paths[term]
      shortest_length = path_info[:shortest_path]
      shortest_path = path_info[:paths].find { |path| path.length == shortest_length }
      parental = shortest_path[1] # the first element is the term itself
      # A one-element path has no parental; never register nil as a term
      next if parental.nil? || parental_terms.include?(parental)

      parental_terms << parental
    end
  end
end
2311
+
2312
+ def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
1941
2313
  results = []
1942
2314
  penalized_terms = {}
1943
- # terms_levels = get_terms_levels(@items_relations.keys)
1944
- terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
1945
- terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
1946
- terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
1947
2315
  levels = terms_levels.keys.sort
1948
2316
  levels.reverse_each do |level|
1949
2317
  terms_levels[level].each do |term|
1950
- associated_items = @items_relations[term]
1951
- if mode == :elim
1952
- items_to_remove = penalized_terms[term]
1953
- items_to_remove = [] if items_to_remove.nil?
1954
- pval = get_fisher_exact_test(
1955
- external_item_list - items_to_remove,
1956
- associated_items - items_to_remove,
1957
- ((associated_items | external_item_list) - items_to_remove).length
1958
- )
1959
- if pval <= thresold
1960
- parents = get_parents(term) # Save the items for each parent term to remove them later in the fisher test
1961
- parents.each do |prnt|
1962
- query = penalized_terms[prnt]
1963
- if query.nil?
1964
- penalized_terms[prnt] = @items_relations[term].clone # We need a new array to store the following iterations
1965
- else
1966
- query.concat(@items_relations[term])
1967
- end
2318
+ associated_items = item_list[term]
2319
+ items_to_remove = penalized_terms[term]
2320
+ items_to_remove = [] if items_to_remove.nil?
2321
+ pval = get_fisher_exact_test(
2322
+ external_item_list - items_to_remove,
2323
+ associated_items - items_to_remove,
2324
+ #((associated_items | external_item_list) - items_to_remove).length
2325
+ total_items
2326
+ )
2327
+ if pval <= thresold
2328
+ parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
2329
+ parents.each do |prnt|
2330
+ query = penalized_terms[prnt]
2331
+ if query.nil?
2332
+ penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
2333
+ else
2334
+ query.concat(item_list[term])
1968
2335
  end
1969
2336
  end
1970
2337
  end
@@ -1974,6 +2341,81 @@ class Ontology
1974
2341
  return results
1975
2342
  end
1976
2343
 
2344
# Walk the terms from the deepest level to the shallowest one, registering the items of
# each term and computing its significance together with its children.
# ===== Parameters
# +terms_levels+:: hash level => terms of that level
# +external_item_list+:: item list handed to computeTermSig
# +total_items+:: total number of items, handed to computeTermSig
# +item_list+:: hash term => items of the term
# ===== Returns
# Array of [term, p-value] pairs, as filled in by computeTermSig
def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
  pvals = {}
  # Unknown terms read as an empty table with default weight 1 that is NOT stored on lookup
  item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
  terms_levels.keys.sort.reverse_each do |level|
    terms_levels[level].each do |term|
      term_items = item_list[term]
      # Initialize observed items in the weight table
      add_items_to_weigthed_list(term, term_items, item_weigths_per_term) unless term_items.nil?
      candidates = @dicts[:is_a][:byValue][term]
      # Only use children with items associated to them OR transfered to them
      children = candidates.nil? ? [] : candidates.select { |child| item_weigths_per_term[child].length > 0 }
      computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
    end
  end
  pvals.to_a
end
2364
+
2365
# Register the items of a term in the weight table with a weight of 1.
# ===== Parameters
# +term+:: term that owns the items
# +associated_items+:: items to register; an item already present gets its weight set back to 1
# +weigthed_list+:: hash term => {item => weight}. It is updated in place
# ===== Returns
# The weight table stored for +term+
def add_items_to_weigthed_list(term, associated_items, weigthed_list)
  # The table is written back explicitly because the lookup may have produced a table that is not stored yet
  weigthed_list[term] = associated_items.each_with_object(weigthed_list[term]) do |item, weights|
    weights[item] = 1
  end
end
2370
+
2371
# Compute the p-value of a term and rebalance the item weights between the term, its
# ancestors and its children according to their significance ratios.
#
# The p-value of the term is stored in +pvals+. Then, for every child, sigRatio is
# computed from the child p-value and the term p-value:
# * CASE 1, no ratio reaches 1: the weights of each child are multiplied by its ratio and
#   the child p-value is recomputed.
# * CASE 2, some ratio reaches 1: for each of those children, the weights of the items of
#   the term are divided by the ratio in the term and in its ancestors, and the term is
#   computed again.
# ===== Parameters
# +term+:: term to evaluate
# +children+:: children of the term to compare with; their p-values must already be in +pvals+
# +external_item_list+:: item list handed to get_fisher_exact_test
# +total_items+:: total number of items handed to get_fisher_exact_test
# +pvals+:: hash term => p-value. It is updated in place
# +item_weigths_per_term+:: hash term => {item => weight}. The weight tables are updated in place
# ===== Returns
# Not meaningful; results are delivered through +pvals+ and +item_weigths_per_term+
def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
  associated_items = item_weigths_per_term[term].keys
  pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
                               'two_sided', item_weigths_per_term[term], true)
  pvals[term] = pval
  return if children.empty?

  rates = children.each_with_object({}) { |child, acc| acc[child] = sigRatio(pvals[child], pval) }
  if rates.values.none? { |ratio| ratio >= 1 } # CASE 1
    children.each do |child|
      child_ratio = rates[child]
      child_weights = item_weigths_per_term[child]
      child_weights.transform_values! { |weight| weight * child_ratio }
      pvals[child] = get_fisher_exact_test(external_item_list, child_weights.keys, total_items,
                                           'two_sided', child_weights, true)
    end
  else # CASE 2
    # Build a new list so the array returned by get_ancestors is never modified
    lineage = get_ancestors(term, true) + [term]
    rates.each do |_child, ratio|
      next unless ratio >= 1 # The child is better than parent

      lineage.each do |anc|
        anc_weights = item_weigths_per_term[anc]
        associated_items.each { |item| anc_weights[item] /= ratio }
      end
    end
    # NOTE(review): rates holds every child, so this difference is always empty and the
    # call only recomputes the p-value of the term with the updated weights
    computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
  end
end
2412
+
2413
# Ratio between the natural logarithms of two p-values.
# ===== Parameters
# +pvalA+:: p-value used as numerator
# +pvalB+:: p-value used as denominator
# ===== Returns
# log(pvalA) / log(pvalB)
def sigRatio(pvalA, pvalB)
  log_a = Math.log(pvalA)
  log_b = Math.log(pvalB)
  log_a / log_b
end
2416
+
2417
+ #============================================================================
2418
+ #============================================================================
1977
2419
 
1978
2420
  # Check if a given ID is a removable (blacklist) term.
1979
2421
  # +DEPRECATED+ use is_removable? instead
@@ -1,3 +1,3 @@
1
1
  module Semtools
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.6"
3
3
  end
data/semtools.gemspec CHANGED
@@ -34,4 +34,5 @@ Gem::Specification.new do |spec|
34
34
 
35
35
  spec.add_development_dependency "rake"
36
36
  spec.add_development_dependency "rspec"
37
+ spec.add_runtime_dependency 'colorize', '>= 0.7.3'
37
38
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semtools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoanezonjic
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-03-22 00:00:00.000000000 Z
12
+ date: 2021-05-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: text
@@ -53,6 +53,20 @@ dependencies:
53
53
  - - ">="
54
54
  - !ruby/object:Gem::Version
55
55
  version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: colorize
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 0.7.3
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.7.3
56
70
  description: This gem allows to perform ontology based operations and calculation
57
71
  of Semantic similarity and information coefficient using different implementations.
58
72
  email: