semtools 0.1.3 → 0.1.6
- checksums.yaml +4 -4
- data/lib/semtools/math_methods.rb +11 -3
- data/lib/semtools/ontology.rb +565 -123
- data/lib/semtools/version.rb +1 -1
- data/semtools.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
+  data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
+  data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
data/lib/semtools/math_methods.rb
CHANGED
@@ -1,7 +1,8 @@
 # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
 #to cmpute fisher exact test
 #Fisher => http://www.biostathandbook.com/fishers.html
-def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
+def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
+  #puts '-', listA.inspect, listB.inspect, '-'
   listA_listB = listA & listB
   listA_nolistB = listA - listB
   nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
     listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-
-
+
+    if partial_weigths
+      nolistA_nolistB_count = all_elements_count - (listA | listB).length
+      all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
+    else
+      nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+      all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
+    end
   end
+  #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
   if tail == 'two_sided'
     accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
   elsif tail == 'less'
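For orientation: the method reduces the two lists to a 2×2 contingency table, and with partial_weigths the universe is rescaled so the four cells sum to all_elements_count. A small worked sketch (item IDs and counts are made up; assumes this file's methods are loaded):

```ruby
# Hypothetical input to get_fisher_exact_test.
listA = [:i1, :i2, :i3]       # e.g. items annotated to an ontology term
listB = [:i2, :i3, :i4, :i5]  # e.g. an external item list
total = 10                    # size of the item universe
# Contingency table derived internally:
#                 in listB   not in listB
#   in listA          2            1
#   not in listA      2            5      # total - |listA U listB|
pval = get_fisher_exact_test(listA, listB, total)
```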
data/lib/semtools/ontology.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'json'
+require 'colorize'
 
 
 class Ontology
@@ -38,7 +39,7 @@
   # => @removable_terms :: array of terms to not be considered
   # => @term_paths :: metainfo about parental paths of each term
 
-  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:
+  @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
   @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
   @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
   @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@
   # Instantiate a OBO_Handler object
   # ===== Parameters
   # +file+:: with info to be loaded (.obo ; .json)
-  # +load_file+:: activate load process automatically
+  # +load_file+:: activate load process automatically
   # +removable_terms+: term to be removed from calcs
   # +build+: flag to launch metainfo calculation
-
+  # +file_format+: force format type despite file extension. Can be :obo or :json
+  def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
     # Initialize object variables
     @header = nil
     @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@
     @items = {}
     @removable_terms = []
     @term_paths = {}
-    # Load if proceeds
     add_removable_terms(removable_terms) if !removable_terms.empty?
-
+    load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
+    # Load if proceeds
+    if load_file
+      fformat = file_format
+      fformat = File.extname(file) if fformat.nil? && !file.nil?
+      if fformat == :obo || fformat == ".obo"
+        load(file, build: build)
+      elsif fformat == :json || fformat == ".json"
+        self.read(file, build: build)
+      elsif !fformat.nil?
+        warn 'Format not allowed. Loading process will not be performed'
+      end
+    end
   end
 
 
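A minimal usage sketch of the resulting constructor behaviour (file names here are hypothetical):

```ruby
require 'semtools'

# Passing file: alone now implies loading; the format is inferred from the extension.
onto = Ontology.new(file: 'hp.obo')
# The new file_format keyword overrides a misleading extension.
onto_json = Ontology.new(file: 'hp.dump', file_format: :json)
```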
@@ -413,31 +426,54 @@
   # +bidirectional+:: calculate bidirectional similitude. Default: false
   # ===== Return
   # similitude calculated
-  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
+  def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
     # Check
     raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
     raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
     micasA = []
     # Compare A -> B
     termsA.each do |tA|
-      micas =
-
-
-
-
-
-
-
+      micas = []
+      termsB.each do |tB|
+        if store_mica
+          value = @mica_index.dig(tA, tB)
+        else
+          value = nil
+        end
+        if value.nil?
+          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+          if store_mica
+            value = true if value.nil? # We use true to save that the operation was made but there is not mica value
+            add2nestHash(@mica_index, tA, tB, value)
+          end
+        end
+        micas << value if value.class == Float
+      end
+      if !micas.empty?
+        micasA << micas.max # Obtain maximum value
+      else
+        micasA << 0
+      end
+    end
+    means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
     # Compare B -> A
     if bidirectional
       means_simA = means_sim * micasA.size
-      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
-      means_sim = (means_simA + means_simB)
+      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
     end
     # Return
     return means_sim
   end
 
+  def add2nestHash(h, key1, key2, val)
+    query1 = h[key1]
+    if query1.nil?
+      h[key1] = {key2 => val}
+    else
+      query1[key2] = val
+    end
+  end
 
 # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
 # ===== Parameters
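In formula form, the rewritten compare implements a bidirectional best-match average (transcribing the code above; @mica_index simply memoizes pairwise results, with true marking pairs that were computed but yielded no MICA):

```latex
\mathrm{sim}_{A \to B} = \frac{1}{|A|} \sum_{t_a \in A} \max_{t_b \in B} \mathrm{sim}(t_a, t_b)
\qquad
\mathrm{sim}(A,B) = \frac{|A|\,\mathrm{sim}_{A \to B} + |B|\,\mathrm{sim}_{B \to A}}{|A| + |B|}
```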
@@ -462,12 +498,13 @@
       main_profiles = @profiles
     end
     # Compare
+    @mica_index = {}
     while !main_ids.empty?
       curr_id = main_ids.shift
       current_profile = main_profiles[curr_id]
       comp_ids.each do |id|
         profile = comp_profiles[id]
-        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
+        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
         query = profiles_similarity[curr_id]
         if query.nil?
           profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@
   # +alt_tag+:: tag used to expand alternative IDs
   # ===== Returns
   # true if process ends without errors and false in other cases
-  def get_index_alternatives(alt_tag: @@basic_tags[:alternative]
+  def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
     # Check input
     raise('stanzas terms empty') if @stanzas[:terms].empty?
     # Take all alternative IDs
    alt_ids2add = {}
     @stanzas[:terms].each do |id, tags|
-
-
-      alt_ids =
-
-
-
-
-
+      if id == tags[:id] # Avoid simulated alternative terms
+        # id = tags[:id] # Take always real ID in case of alternative terms simulted
+        alt_ids = tags[alt_tag]
+        if !alt_ids.nil?
+          alt_ids = alt_ids - @removable_terms - [id]
+          # Update info
+          alt_ids.each do |alt_term|
+            @alternatives_index[alt_term] = id
+            alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
+            @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
+          end
         end
       end
     end
@@ -510,10 +550,11 @@
   # ===== Returns
   # true if eprocess ends without errors and false in other cases
   def build_index()
-    self.get_index_alternatives
     self.get_index_obsoletes
+    self.get_index_alternatives
     self.get_index_child_parent_relations
     @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
+    ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
     @alternatives_index.compact!
     @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
     @obsoletes_index.compact!
@@ -536,8 +577,6 @@
     if @ancestors_index.empty?
       warn('ancestors_index object is empty')
     else
-      # Prepare useful variables
-      alternative_terms = @alternatives_index.keys
       # Per each term, add frequencies
       @stanzas[:terms].each do |id, tags|
         if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@
           @meta[id] = query
         end
         # Store metadata
-        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc|
-        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc|
+        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
+        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
         query[:struct_freq] = query[:descendants] + 1.0
         # Update maximums
         @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@
     # Check obsoletes
     @stanzas[:terms].each do |id, term_tags|
       next if term_tags.nil?
+      next if self.is_alternative?(id)
       query = term_tags[obs_tag]
       if !query.nil? && query == 'true' # Obsolete tag presence
         next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@
       end
     end
     # Store alternatives
-    @alternatives_index.each do |id,alt|
-
-
-    end
+    # @alternatives_index.each do |id,alt|
+    #   anc[id] = anc[alt] if anc.include?(alt)
+    #   des[id] = des[alt] if des.include?(alt)
+    # end
     # Check structure
     if ![:atomic,:sparse].include? structType
       structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@
   # the IC calculated
   def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
     term = termRaw.to_sym
+    curr_ics = @ics[type]
     # Check
     raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
     # Check if it's already calculated
-    return
+    return curr_ics[term] if (curr_ics.include? term) && !force
     # Calculate
     ic = - 1
+    term_meta = @meta[term]
     case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
     ###########################################
     #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@
     ###########################################
     when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
       # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
     when :resnik_observed
       # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
     # Lin
     # Jiang & Conrath
 
@@ -745,17 +787,17 @@
     ###########################################
     when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
       # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-      ic = 1 - Math.log10(
+      ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
       if :zhou # New Model of Semantic Similarity Measuring in Wordnet
         # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
         @ics[:seco][term] = ic # Special store
-        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(
+        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
       end
     when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
-      ic = -Math.log10((
+      ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
       # Knappe
     end
-
+    curr_ics[term] = ic
     return ic
   end
 
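Collecting the IC variants above as equations, transcribed from the inline comments and the completed code (the code uses log base 10; struct_freq corresponds to hypo(x) + 1):

```latex
\mathrm{IC}_{\mathrm{Resnik}}(x) = -\log\frac{f_{\mathrm{struct}}(x)}{\max_t f_{\mathrm{struct}}(t)}
\qquad
\mathrm{IC}_{\mathrm{Seco}}(x) = 1 - \frac{\log(\mathrm{hypo}(x)+1)}{\log(N_{\mathrm{terms}})}
```
```latex
\mathrm{IC}_{\mathrm{Zhou}}(x) = k\,\mathrm{IC}_{\mathrm{Seco}}(x) + (1-k)\,\frac{\log(\mathrm{depth}(x))}{\log(\mathrm{depth}_{\max})}
\qquad
\mathrm{IC}_{\mathrm{Sanchez}}(x) = -\log\frac{\mathrm{hypo}(x)/\mathrm{anc}(x) + 1}{\mathrm{depth}_{\max} + 1}
```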
@@ -788,8 +830,8 @@
   # ===== Returns
   # the IC of the MICA(termA,termB)
   def get_ICMICA(termA, termB, ic_type = :resnik)
-
-    return
+    term, ic = self.get_MICA(termA, termB, ic_type)
+    return term.nil? ? nil : ic
   end
 
 
@@ -812,19 +854,12 @@
     # Obtain ancestors (include itselfs too)
     anc_A = self.get_ancestors(termA)
     anc_B = self.get_ancestors(termB)
-
     if !(anc_A.empty? && anc_B.empty?)
       anc_A << termA
       anc_B << termB
-      # Find shared ancestors
-
-
-      if shared_ancestors.length > 0
-        shared_ancestors.each do |anc|
-          ic = self.get_IC(anc, type: ic_type)
-          # Check
-          mica = [anc,ic] if ic > mica[1]
-        end
+      (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
+        ic = self.get_IC(anc, type: ic_type)
+        mica = [anc,ic] if ic > mica[1]
       end
     end
   end
@@ -844,9 +879,8 @@
     # Check
     raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
     sim = nil
-
-
-    if !sim_res.nil?
+    mica, sim_res = get_MICA(termA, termB, ic_type)
+    if !mica.nil?
       case type
       when :resnik
         sim = sim_res
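The :lin and :jiang_conrath branches fall outside this hunk; for reference, their standard literature definitions in terms of the MICA (not visible in this diff, so shown only as the conventional formulas):

```latex
\mathrm{sim}_{\mathrm{Resnik}}(a,b) = \mathrm{IC}(\mathrm{MICA}(a,b))
\qquad
\mathrm{sim}_{\mathrm{Lin}}(a,b) = \frac{2\,\mathrm{IC}(\mathrm{MICA}(a,b))}{\mathrm{IC}(a)+\mathrm{IC}(b)}
\qquad
\mathrm{dist}_{\mathrm{JC}}(a,b) = \mathrm{IC}(a)+\mathrm{IC}(b)-2\,\mathrm{IC}(\mathrm{MICA}(a,b))
```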
@@ -922,6 +956,16 @@
     jsonFile = File.open(file)
     jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
     # Pre-process (Symbolize some hashs values)
+    if !jsonInfo[:header].nil?
+      aux = jsonInfo[:header].map do |entry,info|
+        if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
+          [entry,info.map{|item| item.to_sym}]
+        else
+          [entry,info]
+        end
+      end
+      jsonInfo[:header] = aux.to_h
+    end
     jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
     jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
     jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@
         if checked.nil?
           t
         else
-          byValue[checked] = byValue.delete(t) if checked != t &&
+          byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
           checked
         end
       end
@@ -1134,7 +1178,8 @@
         else
           aux = self.extract_id(referenceValue)
         end
-
+        aux.compact! unless aux.nil?
+        referenceValue = aux unless aux.nil?
       end
       referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
       byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@
   # ===== Returns
   # cleaned profile
   def clean_profile(profile, remove_alternatives: true)
+    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
     terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
     if remove_alternatives
       terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@
     return terms_without_ancestors_and_alternatices
   end
 
+  def clean_profile_hard(profile)
+    profile, _ = check_ids(profile)
+    profile = profile.select{|t| !is_obsolete?(t)}
+    profile = clean_profile(profile.uniq)
+    return profile
+  end
+
+  # Remove terms from a given profile using hierarchical info and scores set given
+  # ===== Parameters
+  # +profile+:: profile to be cleaned
+  # +scores+:: hash with terms by keys and numerical values (scores)
+  # +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
+  # +remove_without_score+:: if true, terms without score will be removed. Default: true
+  # ===== Returns
+  # cleaned profile
+  def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+    scores = scores.sort_by{|term,score| score}.to_h
+    keep = profile.map do |term|
+      if scores.include?(term)
+        parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+        targetable = parentals.select{|parent| profile.include?(parent)}
+        if targetable.empty?
+          term
+        else
+          targetable << term
+          targets = scores.select{|term,score| targetable.include?(term)}.to_h
+          byMax ? targets.keys.last : targets.keys.first
+        end
+      elsif remove_without_score
+        nil
+      else
+        term
+      end
+    end
+    return keep.compact.uniq
+  end
+
 
 # Remove alternatives (if official term is present) and ancestors terms of stored profiles
 # ===== Parameters
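A usage sketch for the new clean_profile_by_score (term IDs are hypothetical; assumes HP:0000118 is a descendant of HP:0000001 in the loaded ontology):

```ruby
profile = [:'HP:0000001', :'HP:0000118']
scores  = { :'HP:0000001' => 0.2, :'HP:0000118' => 0.9 }
onto.clean_profile_by_score(profile, scores, byMax: true)
#=> [:'HP:0000118']  # within each hierarchically related group, the best-scored term survives
```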
@@ -1635,44 +1718,45 @@
 
   # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
   # Also calculates paths metadata and stores into @term_paths
-  def calc_term_paths
-    self.calc_ancestors_dictionary if
-    visited_terms =
+  def calc_term_paths(only_main_terms=false)
+    self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
+    visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
     @term_paths = {}
     if [:hierarchical, :sparse].include? @structureType
-
-
-      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
+      @stanzas[:terms].each do |term, t_attributes|
+        if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
          special_term = term
          term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          @term_paths[special_term] = @term_paths[term]
-          visited_terms
+          visited_terms[special_term] = true
        end
-
        if !visited_terms.include?(term)
-
+          # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
+          path_attr = @term_paths[term]
+          if path_attr.nil?
+            path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
+            @term_paths[term] = path_attr #save path data container
+          end
          parentals = @dicts[:is_a][:byTerm][term]
          if parentals.nil?
-
+            path_attr[:paths] << [term]
          else
            parentals.each do |direct_parental|
-
-
-
-              self.expand_path(direct_parental, visited_terms)
-              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
-            end
-            new_paths.each{|path| @term_paths[term][:paths] << path}
+              self.expand_path(direct_parental)
+              new_paths = @term_paths[direct_parental][:paths]
+              path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
            end
-          end
-          visited_terms
+          end
+          anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
+          visited_terms[term] = true
        end
        # Update metadata
-
-
-
-
+        path_attr = @term_paths[term]
+        path_attr[:total_paths] = path_attr[:paths].length
+        paths_sizes = path_attr[:paths].map{|path| path.length}
+        path_attr[:largest_path] = paths_sizes.max
+        path_attr[:shortest_path] = paths_sizes.min
      end
     else
       warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
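For reference, the record that calc_term_paths leaves in @term_paths per term, as built by the code above (the term ID is hypothetical):

```ruby
# @term_paths[:'HP:0000002']
#=> {
#     total_paths: 2,      # number of root-reaching paths found
#     largest_path: 4,     # node count of the longest path
#     shortest_path: 3,    # node count of the shortest path
#     paths: [[:'HP:0000002', :'HP:0000001'], ...]  # each path begins with the term itself
#   }
```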
@@ -1684,20 +1768,25 @@
   # ===== Parameters
   # +curr_term+:: current visited term
   # +visited_terms+:: already expanded terms
-  def expand_path(curr_term
-    if
-
+  def expand_path(curr_term)
+    if !@term_paths.include?(curr_term)
+      path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+      @term_paths[curr_term] = path_attr
       direct_parentals = @dicts[:is_a][:byTerm][curr_term]
       if direct_parentals.nil? # No parents :: End of recurrence
-
+        path_attr[:paths] << [curr_term]
       else # Expand and concat
         direct_parentals.each do |ancestor|
-
-
-
+          path_attr_parental = @term_paths[ancestor]
+          if path_attr_parental.nil? # Calculate new paths
+            self.expand_path(ancestor)
+            new_paths = @term_paths[ancestor][:paths]
+          else # Use direct_parental paths already calculated
+            new_paths = path_attr_parental[:paths]
+          end
+          path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
         end
       end
-      visited_terms << curr_term
     end
   end
 
@@ -1717,6 +1806,26 @@
     return @dicts[:level][:byValue][term]
   end
 
+  # nil, term not found, [] term exists but not has parents
+  def get_parental_path(term, which_path = :shortest_path, level = 0)
+    path = nil
+    path_attr = @term_paths[term]
+    if !path_attr.nil?
+      path_length = path_attr[which_path]
+      all_paths = path_attr[:paths]
+      if all_paths.empty?
+        path = []
+      else
+        path = all_paths.select{|pt| pt.length == path_length}.first.clone
+        if level > 0 # we want the term and his ascendants until a specific level
+          n_parents = path_length - level
+          path = path[0..n_parents]
+        end
+        path.shift # Discard the term itself
+      end
+    end
+    return path
+  end
 
 # Return ontology levels from profile terms
 # ===== Returns
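A usage sketch for get_parental_path (hypothetical term IDs; requires calc_term_paths to have populated @term_paths):

```ruby
onto.get_parental_path(:'HP:0000118')                     # shortest path to root, the term itself removed
onto.get_parental_path(:'HP:0000118', :largest_path)      # follow the longest path instead
onto.get_parental_path(:'HP:0000118', :shortest_path, 2)  # keep only ascendants down to level 2
onto.get_parental_path(:'HP:9999999')                     # nil when the term has no path record
```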
@@ -1737,6 +1846,83 @@
     return levels_filtered
   end
 
+  def get_profile_ontology_distribution_tables
+    cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+    uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+    hpo_ontology_levels = get_ontology_levels
+    total_ontology_terms = hpo_ontology_levels.values.flatten.length
+    total_cohort_terms = cohort_ontology_levels.values.flatten.length
+    total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
+
+    ontology_levels = []
+    distribution_percentage = []
+    hpo_ontology_levels.each do |level, terms|
+      cohort_terms = cohort_ontology_levels[level]
+      uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+      if cohort_terms.nil? || uniq_cohort_terms.nil?
+        num = 0
+        u_num = 0
+      else
+        num = cohort_terms.length
+        u_num = uniq_cohort_terms.length
+      end
+      ontology_levels << [level, terms.length, num]
+      distribution_percentage << [
+        level,
+        (terms.length.fdiv(total_ontology_terms)*100).round(3),
+        (num.fdiv(total_cohort_terms)*100).round(3),
+        (u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+      ]
+    end
+    ontology_levels.sort! { |x,y| x.first <=> y.first }
+    distribution_percentage.sort! { |x,y| x.first <=> y.first }
+    return ontology_levels, distribution_percentage
+  end
+
+  def get_dataset_specifity_index(mode)
+    ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+    if mode == 'uniq'
+      observed_distribution = 3
+    elsif mode == 'weigthed'
+      observed_distribution = 2
+    end
+    max_terms = distribution_percentage.map{|row| row[1]}.max
+    maxL = nil
+    distribution_percentage.each do |level_info|
+      maxL = level_info.first if level_info[1] == max_terms
+    end
+    diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+    diffL.select!{|dL| dL.last > 0}
+    lowSection = diffL.select{|dL| dL.first <= maxL}
+    highSection = diffL.select{|dL| dL.first > maxL}
+    dsi = nil
+    if highSection.empty?
+      dsi = 0
+    else
+      accumulated_weigth = 0
+      accumulated_weigthed_diffL = 0
+      hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+      lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+      dsi = hss.fdiv(lss)
+    end
+    return dsi
+  end
+
+  def get_weigthed_level_contribution(section, maxL, nLevels)
+    accumulated_weigthed_diffL = 0
+    section.each do |level, diff|
+      weightL = maxL - level
+      if weightL >= 0
+        weightL += 1
+      else
+        weightL = weightL.abs
+      end
+      accumulated_weigthed_diffL += diff * weightL
+    end
+    weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+    return weigthed_contribution
+  end
+
 
   # Calculate profiles dictionary with Key= Term; Value = Profiles
   def calc_profiles_dictionary
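Reading the new dataset specificity index as a formula (my interpretation of the code above, not stated in the source): per level, the positive excess of the observed distribution over the ontology's is weighted by distance from the modal level maxL, and the high-level section is compared against the low-level one.

```latex
\Delta_\ell = p^{\mathrm{obs}}_\ell - p^{\mathrm{onto}}_\ell \;(>0 \text{ kept}), \qquad
C_{\mathrm{sec}} = \frac{1}{n_{\mathrm{levels}}} \sum_{\ell \in \mathrm{sec}} w_\ell\,\Delta_\ell, \qquad
\mathrm{DSI} = \frac{C_{\mathrm{high}}}{C_{\mathrm{low}}}
```

where w is maxL - l + 1 at or below the modal level and l - maxL above it.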
@@ -1808,17 +1994,66 @@
       end
     end
     if expand
-
-
-
-
-
-
-
+      @items = self.concatItems(@items,relations)
+      # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
+      #   if @items.keys.include?(k)
+      #     if v.kind_of?(Array)
+      #       @items[k] = (@items[k] + v).uniq
+      #     elsif v.kind_of?(Hash)
+      #       @items.merge!(relations) do |k, oldV, newV|
+      #         if oldV.kind_of?(Array)
+      #           return (oldV + newV).uniq
+      #         else
+      #           oldV = [oldV,newV]
+      #         end
+      #       end
+      #     elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
+      #       @items[k] = (@items[k] + [v]).uniq
+      #     else
+      #       @items[k] = [@items[k],v]
+      #     end
+      #   else
+      #     @items[k] = v
+      #   end
+      # end
     else
       @items.merge!(relations)
     end
-  end
+  end
+
+  # Internal function to concat two elements.
+  # ===== Parameters
+  # +itemA+:: item to be concatenated
+  # +itemB+:: item to be concatenated
+  # ===== Returns
+  # Concatenated objects
+  def concatItems(itemA,itemB)
+    # A is Array :: RETURN ARRAY
+    #   A_array : B_array
+    #   A_array : B_hash => NOT ALLOWED
+    #   A_array : B_single => NOT ALLOWED
+    # A is Hash :: RETURN HASH
+    #   A_hash : B_array => NOT ALLOWED
+    #   A_hash : B_hash
+    #   A_hash : B_single => NOT ALLOWED
+    # A is single element => RETURN ARRAY
+    #   A_single : B_array
+    #   A_single : B_hash => NOT ALLOWED
+    #   A_single : B_single
+    concatenated = nil
+    if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+      concatenated = (itemA + itemB).uniq
+    elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+      concatenated = itemA.merge(itemB) do |k, oldV, newV|
+        self.concatItems(oldV,newV)
+      end
+    elsif itemB.kind_of?(Array)
+      concatenated = ([itemA] + itemB).uniq
+    elsif ![Array, Hash].include?(itemB.class)
+      concatenated = [itemA,itemB].uniq
+    end
+    return concatenated
+  end
 
 
   # Assign a dictionary already calculated as a items set.
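Behaviour sketch for the new concatItems (values are made up; the method is what the items-expansion branch above now delegates to):

```ruby
onto.concatItems([1, 2], [2, 3])              #=> [1, 2, 3]
onto.concatItems({a: [1]}, {a: [2], b: [3]})  #=> {a: [1, 2], b: [3]}  (hash values merged recursively)
onto.concatItems(:x, [:y])                    #=> [:x, :y]
```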
@@ -1826,7 +2061,7 @@
   # +dictID+:: dictionary ID to be stored (:byTerm will be used)
   def set_items_from_dict(dictID, remove_old_relations = false)
     @items = {} if remove_old_relations
-    if
+    if !@dicts[dictID].nil?
       @items.merge(@dicts[dictID][:byTerm])
     else
       warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@
     curr_keys.map do |term_expand|
       to_infer = []
       # Obtain childs
-      childs = self.get_descendants(term_expand,true).select{|t|
+      childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
       # Expand
       if childs.length > 0 && minimum_childs == 1 # Special case
         to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@
   end
 
 
+  # Return direct ancestors/descendants of a given term
+  # ===== Parameters
+  # +term+:: which are requested
+  # +relation+:: can be :ancestor or :descendant
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct ancestors/descendants of given term or nil if any error occurs
+  def get_direct_related(term, relation, remove_alternatives: false)
+    if @dicts[:is_a].nil?
+      warn("Hierarchy dictionary is not already calculated. Returning nil")
+      return nil
+    end
+    target = nil
+    case relation
+    when :ancestor
+      target = :byTerm
+    when :descendant
+      target = :byValue
+    else
+      warn('Relation type not allowed. Returning nil')
+    end
+    return nil if target.nil?
+    query = @dicts[:is_a][target][term]
+    return query if query.nil?
+    query, _ = remove_alternatives_from_profile(query) if remove_alternatives
+    return query
+  end
+
+
+  # Return direct ancestors of a given term
+  # ===== Parameters
+  # +term+:: which ancestors are requested
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct ancestors of given term or nil if any error occurs
+  def get_direct_ancentors(term, remove_alternatives: false)
+    return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
+  end
+
+  # Return direct descendants of a given term
+  # ===== Parameters
+  # +term+:: which descendants are requested
+  # +remove_alternatives+:: if true, alternatives will be removed
+  # ===== Returns
+  # Direct descendants of given term or nil if any error occurs
+  def get_direct_descendants(term, remove_alternatives: false)
+    return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
+  end
+
+
+
+  #============================================================================
+  #============================================================================
 
   # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
   # ===== Parameters
   # ++::
   # ===== Returns
   # ...
-  def compute_relations_to_items(external_item_list, mode, thresold)
+  def compute_relations_to_items(external_item_list, total_items, mode, thresold)
+    terms_levels = list_terms_per_level_from_items
+    #puts terms_levels.inspect.yellow
+    connect_familiars!(terms_levels)
+    #puts terms_levels.inspect.blue
+    item_list_with_transf_parental = get_item_list_parental(terms_levels)
+    results = []
+    if mode == :elim
+      results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+    elsif mode == :weight
+      results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
+    end
+    return results
+  end
+
+  def get_item_list_parental(terms_levels)
+    transfered_list = {}
+    parent_dict = @dicts[:is_a][:byTerm]
+    levels = terms_levels.keys.sort
+    while levels.length > 1
+      level = levels.pop
+      terms_levels[level].each do |term|
+        parents = parent_dict[term]
+        if parents.nil?
+          next
+        elsif parents.length == 1
+          parent = parents.first
+        else
+          parent = (parents | terms_levels[level - 1]).first
+        end
+        term_it = @items[term]
+        parent_it = @items[parent]
+        curr_it = transfered_list[term]
+        parent_all_items = merge_groups([term_it, parent_it, curr_it])
+        transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+        term_all_items = merge_groups([term_it, curr_it])
+        transfered_list[term] = term_all_items if !term_all_items.empty?
+      end
+    end
+    terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
+      transfered_list[term] = @items[term] if transfered_list[term].nil?
+    end
+    return transfered_list
+  end
+
+  def merge_groups(groups)
+    return groups.compact.inject([]){|it, a| it | a}
+  end
+
+  def list_terms_per_level_from_items
+    terms_levels = {}
+    @items.each do |term, items|
+      level = self.get_term_level(term)
+      query = terms_levels[level]
+      if query.nil?
+        terms_levels[level] = [term]
+      else
+        query << term
+      end
+    end
+    return terms_levels
+  end
+
+  def connect_familiars!(terms_levels)
+    levels = terms_levels.keys.sort
+    while levels.length > 1 # Process when current level has a parental level
+      level = levels.pop
+      parental_level = level - 1
+      parental_terms = terms_levels[parental_level]
+      if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
+        parental_terms = [] # Initialize required parental level
+        terms_levels[parental_level] = parental_terms
+        levels << parental_level
+      end
+      terms_levels[level].each do |term|
+        path_info = @term_paths[term]
+        shortest_path_length = path_info[:shortest_path]
+        path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+        parental = path[1] # the first elements is the term itself
+        parental_terms << parental if !parental_terms.include?(parental)
+      end
+    end
+  end
+
+  def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
     results = []
     penalized_terms = {}
-    # terms_levels = get_terms_levels(@items_relations.keys)
-    terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
-    terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
-    terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
     levels = terms_levels.keys.sort
     levels.reverse_each do |level|
       terms_levels[level].each do |term|
-        associated_items =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
+        associated_items = item_list[term]
+        items_to_remove = penalized_terms[term]
+        items_to_remove = [] if items_to_remove.nil?
+        pval = get_fisher_exact_test(
+          external_item_list - items_to_remove,
+          associated_items - items_to_remove,
+          #((associated_items | external_item_list) - items_to_remove).length
+          total_items
+        )
+        if pval <= thresold
+          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+          parents.each do |prnt|
+            query = penalized_terms[prnt]
+            if query.nil?
+              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+            else
+              query.concat(item_list[term])
            end
          end
        end
@@ -1974,6 +2341,81 @@
     return results
   end
 
+  def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
+    pvals = {}
+    item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
+    levels = terms_levels.keys.sort
+    levels.reverse_each do |level|
+      terms_levels[level].each do |term|
+        associated_items = item_list[term]
+        #initialize observed items in item_weigths_per_term list
+        add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
+        children = @dicts[:is_a][:byValue][term]
+        if children.nil?
+          children = []
+        else
+          children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
+        end
+        computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+      end
+    end
+    return pvals.to_a
+  end
+
+  def add_items_to_weigthed_list(term, associated_items, weigthed_list)
+    term_weigthing = weigthed_list[term]
+    associated_items.each{|ai| term_weigthing[ai] = 1}
+    weigthed_list[term] = term_weigthing
+  end
+
+  def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
+    #puts term.to_s.red
+    #puts @term_paths[term].inspect
+    #puts @dicts[:is_a][:byValue][term].inspect.light_blue
+    associated_items = item_weigths_per_term[term].keys
+    pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
+      'two_sided', item_weigths_per_term[term], true)
+    pvals[term] = pval
+    if children.length > 0
+      rates = {}
+      sig_child = 0
+      children.each do |child|
+        ratio = sigRatio(pvals[child], pval)
+        rates[child] = ratio
+        sig_child += 1 if ratio >= 1
+      end
+      if sig_child == 0 # CASE 1
+        children.each do |child|
+          current_ratio = rates[child]
+          query_child = item_weigths_per_term[child]
+          query_child.transform_values!{|weight| weight * current_ratio}
+          pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
+            'two_sided', item_weigths_per_term[child], true)
+        end
+      else
+        ancs = get_ancestors(term, filter_alternatives = true)
+        ancs << term
+        rates.each do |ch, ratio|# CASE 2
+          if ratio >= 1 # The child is better than parent
+            ancs.each do |anc|
+              query_anc = item_weigths_per_term[anc]
+              associated_items.each do |item|
+                query_anc[item] /= ratio # /= --> query_anc[item]/ratio
+              end
+            end
+          end
+        end
+        computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
+      end
+    end
+  end
+
+  def sigRatio(pvalA, pvalB)
+    return Math.log(pvalA)/Math.log(pvalB)
+  end
+
+  #============================================================================
+  #============================================================================
 
   # Check if a given ID is a removable (blacklist) term.
   # +DEPRECATED+ use is_removable? instead
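The recursion in computeTermSig hinges on sigRatio; in equation form (transcribing the method above):

```latex
\mathrm{sigRatio}(p_{\mathrm{child}}, p_{\mathrm{parent}}) = \frac{\ln p_{\mathrm{child}}}{\ln p_{\mathrm{parent}}}
```

A ratio of at least 1 means the child's p-value is at least as small as (as significant as) its parent's, which selects between CASE 1 (down-weight all children) and CASE 2 (down-weight the ancestors).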
data/lib/semtools/version.rb
CHANGED
data/semtools.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semtools
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.6
 platform: ruby
 authors:
 - seoanezonjic
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text
@@ -53,6 +53,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.3
 description: This gem allows to perform ontology based operations and calculation
   of Semantic similarity and information coefficient using different implementations.
 email: