semtools 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/semtools/math_methods.rb +11 -3
- data/lib/semtools/ontology.rb +565 -123
- data/lib/semtools/version.rb +1 -1
- data/semtools.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 83746b4834f16f9bffc404c578be30e427bf90c7f43ba1f55bd04d94a29186c5
+  data.tar.gz: 25f0bd67c733d4289f2e790bc857886a184dd95677beaf378da9074957660e69
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 602fd8d61f9e9f34c2de957dd4d311a510413eea2bb0d1794b4d2da6f4e8959d3919dd8e9ea4a0d9f5d4e962443b7f50675e075d385830aeb9ef08ecb38d3fe2
+  data.tar.gz: 5b9f66a1fef9c3296e5fe203330e3575e4fecf0f363fc8b884abd56e8fbcb1bd5dac3563792a4a5da908bcee409e2d36596751123aa2012d5cc9a8a5ff3c7796
data/lib/semtools/math_methods.rb
CHANGED
@@ -1,7 +1,8 @@
 # TODO: Make a pull request to https://rubygems.org/gems/ruby-statistics, with all the statistic code implemented here.
 #to cmpute fisher exact test
 #Fisher => http://www.biostathandbook.com/fishers.html
-def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil)
+def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', weigths=nil, partial_weigths=true)
+  #puts '-', listA.inspect, listB.inspect, '-'
   listA_listB = listA & listB
   listA_nolistB = listA - listB
   nolistA_listB = listB - listA
@@ -16,9 +17,16 @@ def get_fisher_exact_test(listA, listB, all_elements_count, tail ='two_sided', w
     listA_listB_count = listA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     listA_nolistB_count = listA_nolistB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
     nolistA_listB_count = nolistA_listB.map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
-
-
+
+    if partial_weigths
+      nolistA_nolistB_count = all_elements_count - (listA | listB).length
+      all_elements_count = nolistA_nolistB_count + listA_listB_count + listA_nolistB_count + nolistA_listB_count
+    else
+      nolistA_nolistB_count = (weigths.keys - (listA | listB)).map{|i| weigths[i]}.inject(0){|sum, n| sum + n}.ceil
+      all_elements_count = weigths.values.inject(0){|sum, n| sum + n}.ceil
+    end
   end
+  #puts [listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count].inspect
   if tail == 'two_sided'
    accumulated_prob = get_two_tail(listA_listB_count, listA_nolistB_count, nolistA_listB_count, nolistA_nolistB_count, all_elements_count)
  elsif tail == 'less'
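The new partial_weigths flag changes how the fourth cell of Fisher's 2x2 contingency table is obtained: from raw set sizes when true, from the full weight map otherwise. A minimal sketch of the table the counts above describe (example lists are hypothetical and unweighted):

    listA = [:t1, :t2, :t3]                          # e.g. items annotated to the query set
    listB = [:t2, :t3, :t4]                          # e.g. items annotated to the reference set
    all_elements_count = 10                          # size of the background universe
    a = (listA & listB).length                       # in both    => 2
    b = (listA - listB).length                       # only in A  => 1
    c = (listB - listA).length                       # only in B  => 1
    d = all_elements_count - (listA | listB).length  # in neither => 6
    # Fisher's exact test is then evaluated over the table [[a, b], [c, d]].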
data/lib/semtools/ontology.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'json'
+require 'colorize'


 class Ontology
@@ -38,7 +39,7 @@ class Ontology
  # => @removable_terms :: array of terms to not be considered
  # => @term_paths :: metainfo about parental paths of each term

- @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:
+ @@basic_tags = {ancestors: [:is_a], obsolete: :is_obsolete, alternative: [:replaced_by,:consider,:alt_id]}
  @@allowed_calcs = {ics: [:resnik, :resnik_observed, :seco, :zhou, :sanchez], sims: [:resnik, :lin, :jiang_conrath]}
  @@symbolizable_ids = [:id, :alt_id, :replaced_by, :consider]
  @@tags_with_trailing_modifiers = [:is_a, :union_of, :disjoint_from, :relationship, :subsetdef, :synonymtypedef, :property_value]
@@ -52,10 +53,11 @@ class Ontology
  # Instantiate a OBO_Handler object
  # ===== Parameters
  # +file+:: with info to be loaded (.obo ; .json)
-	# +load_file+:: activate load process automatically
+	# +load_file+:: activate load process automatically
  # +removable_terms+: term to be removed from calcs
  # +build+: flag to launch metainfo calculation
-
+	# +file_format+: force format type despite file extension. Can be :obo or :json
+	def initialize(file: nil, load_file: false, removable_terms: [], build: true, file_format: nil)
    # Initialize object variables
    @header = nil
    @stanzas = {terms: {}, typedefs: {}, instances: {}}
@@ -74,9 +76,20 @@ class Ontology
    @items = {}
    @removable_terms = []
    @term_paths = {}
-    # Load if proceeds
    add_removable_terms(removable_terms) if !removable_terms.empty?
-
+    load_file = true unless file.nil? # This should remove load_file argument, keep it for old scripts
+    # Load if proceeds
+    if load_file
+      fformat = file_format
+      fformat = File.extname(file) if fformat.nil? && !file.nil?
+      if fformat == :obo || fformat == ".obo"
+        load(file, build: build)
+      elsif fformat == :json || fformat == ".json"
+        self.read(file, build: build)
+      elsif !fformat.nil?
+        warn 'Format not allowed. Loading process will not be performed'
+      end
+    end
  end

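With the new auto-load behaviour, passing file alone triggers loading, and file_format overrides extension detection. A hypothetical usage sketch (the file names are illustrative only):

    require 'semtools'

    onto = Ontology.new(file: 'hp.obo')                      # format inferred from the extension
    onto = Ontology.new(file: 'hp.dump', file_format: :json) # extension overridden explicitly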
@@ -413,31 +426,54 @@ class Ontology
  # +bidirectional+:: calculate bidirectional similitude. Default: false
  # ===== Return
  # similitude calculated
-	def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true)
+	def compare(termsA, termsB, sim_type: :resnik, ic_type: :resnik, bidirectional: true, store_mica: false)
    # Check
    raise ArgumentError, "Terms sets given are NIL" if termsA.nil? | termsB.nil?
    raise ArgumentError, "Set given is empty. Aborting similarity calc" if termsA.empty? | termsB.empty?
    micasA = []
    # Compare A -> B
    termsA.each do |tA|
-      micas =
-
-
-
-
-
-
-
+      micas = []
+      termsB.each do |tB|
+        if store_mica
+          value = @mica_index.dig(tA, tB)
+        else
+          value = nil
+        end
+        if value.nil?
+          value = self.get_similarity(tA, tB, type: sim_type, ic_type: ic_type)
+          if store_mica
+            value = true if value.nil? # We use true to save that the operation was made but there is not mica value
+            add2nestHash(@mica_index, tA, tB, value)
+          end
+        end
+        micas << value if value.class == Float
+      end
+      if !micas.empty?
+        micasA << micas.max # Obtain maximum value
+      else
+        micasA << 0
+      end
+    end
+    means_sim = micasA.inject{ |sum, el| sum + el }.fdiv(micasA.size)
    # Compare B -> A
    if bidirectional
      means_simA = means_sim * micasA.size
-      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false) * termsB.size
-      means_sim = (means_simA + means_simB)
+      means_simB = self.compare(termsB, termsA, sim_type: sim_type, ic_type: ic_type, bidirectional: false, store_mica: store_mica) * termsB.size
+      means_sim = (means_simA + means_simB).fdiv(termsA.size + termsB.size)
    end
    # Return
    return means_sim
  end

+	def add2nestHash(h, key1, key2, val)
+		query1 = h[key1]
+		if query1.nil?
+			h[key1] = {key2 => val}
+		else
+			query1[key2] = val
+		end
+	end

  # Compare internal stored profiles against another set of profiles. If an external set is not provided, internal profiles will be compared with itself
  # ===== Parameters
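When store_mica is set, @mica_index becomes a nested pairwise cache, so term pairs repeated across profile comparisons skip get_similarity entirely; true is stored as a sentinel for pairs that were computed but yielded no MICA. A minimal sketch of the same caching pattern, standing alone (slow_similarity is a hypothetical stand-in for get_similarity):

    # Hypothetical stand-in for an expensive pairwise computation
    def slow_similarity(a, b)
      rand
    end

    CACHE = {}

    def cached_sim(a, b)
      value = CACHE.dig(a, b)            # nil on a cache miss
      if value.nil?
        value = slow_similarity(a, b)
        value = true if value.nil?       # sentinel: pair computed, no MICA found
        (CACHE[a] ||= {})[b] = value     # same idea as add2nestHash in the diff
      end
      value.is_a?(Float) ? value : nil   # sentinel hits count as "no value"
    end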
@@ -462,12 +498,13 @@ class Ontology
      main_profiles = @profiles
    end
    # Compare
+    @mica_index = {}
    while !main_ids.empty?
      curr_id = main_ids.shift
      current_profile = main_profiles[curr_id]
      comp_ids.each do |id|
        profile = comp_profiles[id]
-        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional)
+        value = compare(current_profile, profile, sim_type: sim_type, ic_type: ic_type, bidirectional: bidirectional, store_mica: true)
        query = profiles_similarity[curr_id]
        if query.nil?
          profiles_similarity[curr_id] = {id => value}
@@ -485,20 +522,23 @@ class Ontology
  # +alt_tag+:: tag used to expand alternative IDs
  # ===== Returns
  # true if process ends without errors and false in other cases
-	def get_index_alternatives(alt_tag: @@basic_tags[:alternative]
+	def get_index_alternatives(alt_tag: @@basic_tags[:alternative].last)
    # Check input
    raise('stanzas terms empty') if @stanzas[:terms].empty?
    # Take all alternative IDs
    alt_ids2add = {}
    @stanzas[:terms].each do |id, tags|
-
-
-      alt_ids =
-
-
-
-
-
+      if id == tags[:id] # Avoid simulated alternative terms
+        # id = tags[:id] # Take always real ID in case of alternative terms simulted
+        alt_ids = tags[alt_tag]
+        if !alt_ids.nil?
+          alt_ids = alt_ids - @removable_terms - [id]
+          # Update info
+          alt_ids.each do |alt_term|
+            @alternatives_index[alt_term] = id
+            alt_ids2add[alt_term] = @stanzas[:terms][id] if !@stanzas[:terms].include?(alt_term)
+            @ancestors_index[alt_term] = @ancestors_index[id] if !@ancestors_index[id].nil?
+          end
        end
      end
    end
@@ -510,10 +550,11 @@ class Ontology
  # ===== Returns
  # true if eprocess ends without errors and false in other cases
  def build_index()
-    self.get_index_alternatives
    self.get_index_obsoletes
+    self.get_index_alternatives
    self.get_index_child_parent_relations
    @alternatives_index.map{|k,v| @alternatives_index[k] = self.extract_id(v)}
+    ## @alternatives_index.map {|k,v| @alternatives_index[k] = self.stanzas[:terms][v][:id] if k == v} unless self.stanzas[:terms].empty?
    @alternatives_index.compact!
    @obsoletes_index.map{|k,v| @obsoletes_index[k] = self.extract_id(v)}
    @obsoletes_index.compact!
@@ -536,8 +577,6 @@ class Ontology
    if @ancestors_index.empty?
      warn('ancestors_index object is empty')
    else
-      # Prepare useful variables
-      alternative_terms = @alternatives_index.keys
      # Per each term, add frequencies
      @stanzas[:terms].each do |id, tags|
        if @alternatives_index.include?(id)
@@ -556,8 +595,8 @@ class Ontology
          @meta[id] = query
        end
        # Store metadata
-        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc|
-        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc|
+        query[:ancestors] = @ancestors_index.include?(id) ? @ancestors_index[id].count{|anc| !@alternatives_index.include?(anc)}.to_f : 0.0
+        query[:descendants] = @descendants_index.include?(id) ? @descendants_index[id].count{|desc| !@alternatives_index.include?(desc)}.to_f : 0.0
        query[:struct_freq] = query[:descendants] + 1.0
        # Update maximums
        @max_freqs[:struct_freq] = query[:struct_freq] if @max_freqs[:struct_freq] < query[:struct_freq]
@@ -582,6 +621,7 @@ class Ontology
    # Check obsoletes
    @stanzas[:terms].each do |id, term_tags|
      next if term_tags.nil?
+      next if self.is_alternative?(id)
      query = term_tags[obs_tag]
      if !query.nil? && query == 'true' # Obsolete tag presence
        next if !@obsoletes_index[id].nil? # Already stored
@@ -633,10 +673,10 @@ class Ontology
      end
    end
    # Store alternatives
-    @alternatives_index.each do |id,alt|
-
-
-    end
+    # @alternatives_index.each do |id,alt|
+    # 	anc[id] = anc[alt] if anc.include?(alt)
+    # 	des[id] = des[alt] if des.include?(alt)
+    # end
    # Check structure
    if ![:atomic,:sparse].include? structType
      structType = structType == :circular ? :circular : :hierarchical
@@ -704,12 +744,14 @@ class Ontology
  # the IC calculated
  def get_IC(termRaw, type: :resnik, force: false, zhou_k: 0.5)
    term = termRaw.to_sym
+    curr_ics = @ics[type]
    # Check
    raise ArgumentError, "IC type specified (#{type}) is not allowed" if !@@allowed_calcs[:ics].include?(type)
    # Check if it's already calculated
-    return
+    return curr_ics[term] if (curr_ics.include? term) && !force
    # Calculate
    ic = - 1
+    term_meta = @meta[term]
    case type # https://arxiv.org/ftp/arxiv/papers/1310/1310.8059.pdf ||| https://sci-hub.st/https://doi.org/10.1016/j.eswa.2012.01.082
    ###########################################
    #### STRUCTURE BASED METRICS
@@ -726,10 +768,10 @@ class Ontology
    ###########################################
    when :resnik # Resnik P: Using Information Content to Evaluate Semantic Similarity in a Taxonomy
      # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:struct_freq].fdiv(@max_freqs[:struct_freq]))
    when :resnik_observed
      # -log(Freq(x) / Max_Freq)
-      ic = -Math.log10(
+      ic = -Math.log10(term_meta[:observed_freq].fdiv(@max_freqs[:observed_freq]))
    # Lin
    # Jiang & Conrath

@@ -745,17 +787,17 @@ class Ontology
    ###########################################
    when :seco, :zhou # SECO:: An intrinsic information content metric for semantic similarity in WordNet
      # 1 - ( log(hypo(x) + 1) / log(max_nodes) )
-      ic = 1 - Math.log10(
+      ic = 1 - Math.log10(term_meta[:struct_freq]).fdiv(Math.log10(@stanzas[:terms].length - @alternatives_index.length))
      if :zhou # New Model of Semantic Similarity Measuring in Wordnet
        # k*(IC_Seco(x)) + (1-k)*(log(depth(x))/log(max_depth))
        @ics[:seco][term] = ic # Special store
-        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(
+        ic = zhou_k * ic + (1.0 - zhou_k) * (Math.log10(term_meta[:descendants]).fdiv(Math.log10(@max_freqs[:max_depth])))
      end
    when :sanchez # Semantic similarity estimation in the biomedical domain: An ontology-basedinformation-theoretic perspective
-      ic = -Math.log10((
+      ic = -Math.log10((term_meta[:descendants].fdiv(term_meta[:ancestors]) + 1.0).fdiv(@max_freqs[:max_depth] + 1.0))
      # Knappe
    end
-
+    curr_ics[term] = ic
    return ic
  end

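For reference, the intrinsic IC formulas that the reconstructed lines implement, written out from the inline comments and the code (base-10 logarithms, as in the diff):

    IC_{Resnik}(x)  = -\log_{10}\frac{freq_{struct}(x)}{maxfreq_{struct}}
    IC_{Seco}(x)    = 1 - \frac{\log_{10} freq_{struct}(x)}{\log_{10}(|terms| - |alternatives|)}
    IC_{Zhou}(x)    = k \cdot IC_{Seco}(x) + (1-k)\,\frac{\log_{10} desc(x)}{\log_{10} maxdepth}
    IC_{Sanchez}(x) = -\log_{10}\frac{desc(x)/anc(x) + 1}{maxdepth + 1}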
@@ -788,8 +830,8 @@ class Ontology
  # ===== Returns
  # the IC of the MICA(termA,termB)
  def get_ICMICA(termA, termB, ic_type = :resnik)
-
-    return
+    term, ic = self.get_MICA(termA, termB, ic_type)
+    return term.nil? ? nil : ic
  end

@@ -812,19 +854,12 @@ class Ontology
    # Obtain ancestors (include itselfs too)
    anc_A = self.get_ancestors(termA)
    anc_B = self.get_ancestors(termB)
-
    if !(anc_A.empty? && anc_B.empty?)
      anc_A << termA
      anc_B << termB
-      # Find shared ancestors
-
-
-      if shared_ancestors.length > 0
-        shared_ancestors.each do |anc|
-          ic = self.get_IC(anc, type: ic_type)
-          # Check
-          mica = [anc,ic] if ic > mica[1]
-        end
+      (anc_A & anc_B).each do |anc| # Find MICA in shared ancestors
+        ic = self.get_IC(anc, type: ic_type)
+        mica = [anc,ic] if ic > mica[1]
      end
    end
  end
@@ -844,9 +879,8 @@ class Ontology
    # Check
    raise ArgumentError, "SIM type specified (#{type}) is not allowed" if !@@allowed_calcs[:sims].include?(type)
    sim = nil
-
-
-    if !sim_res.nil?
+    mica, sim_res = get_MICA(termA, termB, ic_type)
+    if !mica.nil?
      case type
      when :resnik
        sim = sim_res
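All three measures listed in @@allowed_calcs derive from the MICA's IC. Only the Resnik branch is visible in this hunk; for context, the textbook definitions (the Jiang-Conrath form is usually expressed as a distance, which implementations then invert or normalize into a similarity):

    sim_{Resnik}(a,b) = IC(MICA(a,b))
    sim_{Lin}(a,b)    = \frac{2 \cdot IC(MICA(a,b))}{IC(a) + IC(b)}
    dist_{JC}(a,b)    = IC(a) + IC(b) - 2 \cdot IC(MICA(a,b))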
@@ -922,6 +956,16 @@ class Ontology
    jsonFile = File.open(file)
    jsonInfo = JSON.parse(jsonFile.read, :symbolize_names => true)
    # Pre-process (Symbolize some hashs values)
+    if !jsonInfo[:header].nil?
+      aux = jsonInfo[:header].map do |entry,info|
+        if info.kind_of?(Array) && @@symbolizable_ids.include?(entry)
+          [entry,info.map{|item| item.to_sym}]
+        else
+          [entry,info]
+        end
+      end
+      jsonInfo[:header] = aux.to_h
+    end
    jsonInfo[:stanzas][:terms].map{|id,info| self.class.symbolize_ids(info)} # STANZAS
    jsonInfo[:stanzas][:typedefs].map{|id,info| self.class.symbolize_ids(info)}
    jsonInfo[:stanzas][:instances].map{|id,info| self.class.symbolize_ids(info)}
@@ -1106,7 +1150,7 @@ class Ontology
      if checked.nil?
        t
      else
-        byValue[checked] = byValue.delete(t) if checked != t &&
+        byValue[checked] = byValue.delete(t) if checked != t && byValue[checked].nil? # Update in byValue
        checked
      end
    end
@@ -1134,7 +1178,8 @@ class Ontology
        else
          aux = self.extract_id(referenceValue)
        end
-
+        aux.compact! unless aux.nil?
+        referenceValue = aux unless aux.nil?
      end
      referenceValue = [referenceValue] if !referenceValue.kind_of?(Array)
      byTerm[term] = referenceValue + (values - referenceValue)
@@ -1525,6 +1570,7 @@ class Ontology
  # ===== Returns
  # cleaned profile
  def clean_profile(profile, remove_alternatives: true)
+    warn('Estructure is circular, behaviour could not be which is expected') if @structureType == :circular
    terms_without_ancestors, _ = self.remove_ancestors_from_profile(profile)
    if remove_alternatives
      terms_without_ancestors_and_alternatices, _ = self.remove_alternatives_from_profile(terms_without_ancestors)
@@ -1534,6 +1580,43 @@ class Ontology
    return terms_without_ancestors_and_alternatices
  end

+	def clean_profile_hard(profile)
+		profile, _ = check_ids(profile)
+		profile = profile.select{|t| !is_obsolete?(t)}
+		profile = clean_profile(profile.uniq)
+		return profile
+	end
+
+	# Remove terms from a given profile using hierarchical info and scores set given
+	# ===== Parameters
+	# +profile+:: profile to be cleaned
+	# +scores+:: hash with terms by keys and numerical values (scores)
+	# +byMax+:: if true, maximum scored term will be keeped, if false, minimum will be keeped
+	# +remove_without_score+:: if true, terms without score will be removed. Default: true
+	# ===== Returns
+	# cleaned profile
+	def clean_profile_by_score(profile, scores, byMax: true, remove_without_score: true)
+		scores = scores.sort_by{|term,score| score}.to_h
+		keep = profile.map do |term|
+			if scores.include?(term)
+				parentals = [self.get_ancestors(term), self.get_descendants(term)].flatten
+				targetable = parentals.select{|parent| profile.include?(parent)}
+				if targetable.empty?
+					term
+				else
+					targetable << term
+					targets = scores.select{|term,score| targetable.include?(term)}.to_h
+					byMax ? targets.keys.last : targets.keys.first
+				end
+			elsif remove_without_score
+				nil
+			else
+				term
+			end
+		end
+		return keep.compact.uniq
+	end
+

  # Remove alternatives (if official term is present) and ancestors terms of stored profiles
  # ===== Parameters
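clean_profile_by_score resolves redundant parent/child pairs by score rather than by hierarchy alone. A hypothetical usage sketch (terms and scores invented; assumes :parent is an ancestor of :child in the loaded ontology):

    profile = [:parent, :child, :orphan]
    scores  = { parent: 0.2, child: 0.9 }

    # With byMax: true the higher-scored :child wins the parent/child clash;
    # :orphan has no score and is dropped (remove_without_score defaults to true).
    onto.clean_profile_by_score(profile, scores)
    # => [:child]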
@@ -1635,44 +1718,45 @@ class Ontology

  # Find paths of a term following it ancestors and stores all possible paths for it and it's parentals.
  # Also calculates paths metadata and stores into @term_paths
-	def calc_term_paths
-		self.calc_ancestors_dictionary if
-		visited_terms =
+	def calc_term_paths(only_main_terms=false)
+		self.calc_ancestors_dictionary if @dicts[:is_a].nil? # Calculate direct parentals dictionary if it's not already calculated
+		visited_terms = {} # PEDRO: To keep track of visited data, hash accesions are fast than array includes. I don't understant why use this variable instead of check @term_paths to see if the data is calculated
    @term_paths = {}
    if [:hierarchical, :sparse].include? @structureType
-
-
-      if self.is_obsolete?(term) || self.is_alternative?(term) # Special case (obsoletes)
+      @stanzas[:terms].each do |term, t_attributes|
+        if !only_main_terms && (self.is_obsolete?(term) || self.is_alternative?(term)) # Special case (obsoletes)
          special_term = term
          term = self.is_obsolete?(term) ? @obsoletes_index[term] : @alternatives_index[term]
          @term_paths[term] = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} if !@term_paths.include?(term)
          @term_paths[special_term] = @term_paths[term]
-          visited_terms
+          visited_terms[special_term] = true
        end
-
        if !visited_terms.include?(term)
-
+          # PEDRO: This code is very similar to expand_path method, but cannot be replaced by it (test fail). We must work to use this method here
+          path_attr = @term_paths[term]
+          if path_attr.nil?
+            path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []} # create new path data
+            @term_paths[term] = path_attr #save path data container
+          end
          parentals = @dicts[:is_a][:byTerm][term]
          if parentals.nil?
-
+            path_attr[:paths] << [term]
          else
            parentals.each do |direct_parental|
-
-
-
-              self.expand_path(direct_parental, visited_terms)
-              new_paths = @term_paths[direct_parental][:paths].map{|path| [term, path].flatten}
-            end
-            new_paths.each{|path| @term_paths[term][:paths] << path}
+              self.expand_path(direct_parental)
+              new_paths = @term_paths[direct_parental][:paths]
+              path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(term)})
            end
-          end
-          visited_terms
+          end
+          anc = @ancestors_index[term].each{|anc| visited_terms[anc] = true} if @ancestors_index.include?(term)
+          visited_terms[term] = true
        end
        # Update metadata
-
-
-
-
+        path_attr = @term_paths[term]
+        path_attr[:total_paths] = path_attr[:paths].length
+        paths_sizes = path_attr[:paths].map{|path| path.length}
+        path_attr[:largest_path] = paths_sizes.max
+        path_attr[:shortest_path] = paths_sizes.min
      end
    else
      warn('Ontology structure must be hierarchical or sparse to calculate term levels. Aborting paths calculation')
@@ -1684,20 +1768,25 @@ class Ontology
  # ===== Parameters
  # +curr_term+:: current visited term
  # +visited_terms+:: already expanded terms
-	def expand_path(curr_term
-		if
-
+	def expand_path(curr_term)
+		if !@term_paths.include?(curr_term)
+			path_attr = {total_paths: 0, largest_path: 0, shortest_path: 0, paths: []}
+			@term_paths[curr_term] = path_attr
      direct_parentals = @dicts[:is_a][:byTerm][curr_term]
      if direct_parentals.nil? # No parents :: End of recurrence
-
+        path_attr[:paths] << [curr_term]
      else # Expand and concat
        direct_parentals.each do |ancestor|
-
-
-
+          path_attr_parental = @term_paths[ancestor]
+          if path_attr_parental.nil? # Calculate new paths
+            self.expand_path(ancestor)
+            new_paths = @term_paths[ancestor][:paths]
+          else # Use direct_parental paths already calculated
+            new_paths = path_attr_parental[:paths]
+          end
+          path_attr[:paths].concat(new_paths.map{|path| path.clone.unshift(curr_term)})
        end
      end
-    visited_terms << curr_term
    end
  end
@@ -1717,6 +1806,26 @@ class Ontology
    return @dicts[:level][:byValue][term]
  end

+	# nil, term not found, [] term exists but not has parents
+	def get_parental_path(term, which_path = :shortest_path, level = 0)
+		path = nil
+		path_attr = @term_paths[term]
+		if !path_attr.nil?
+			path_length = path_attr[which_path]
+			all_paths = path_attr[:paths]
+			if all_paths.empty?
+				path = []
+			else
+				path = all_paths.select{|pt| pt.length == path_length}.first.clone
+				if level > 0 # we want the term and his ascendants until a specific level
+					n_parents = path_length - level
+					path = path[0..n_parents]
+				end
+				path.shift # Discard the term itself
+			end
+		end
+		return path
+	end

  # Return ontology levels from profile terms
  # ===== Returns
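A hypothetical call pattern for the new accessor (term IDs invented; calc_term_paths must have populated @term_paths first):

    onto.calc_term_paths
    onto.get_parental_path(:unknown_term)                 # => nil, term not tracked
    onto.get_parental_path(:leaf_term)                    # => shortest ancestor chain, nearest first
    onto.get_parental_path(:leaf_term, :largest_path, 3)  # longest path, truncated at level 3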
@@ -1737,6 +1846,83 @@ class Ontology
    return levels_filtered
  end

+	def get_profile_ontology_distribution_tables
+		cohort_ontology_levels = get_ontology_levels_from_profiles(uniq=false)
+		uniq_cohort_ontology_levels = get_ontology_levels_from_profiles
+		hpo_ontology_levels = get_ontology_levels
+		total_ontology_terms = hpo_ontology_levels.values.flatten.length
+		total_cohort_terms = cohort_ontology_levels.values.flatten.length
+		total_uniq_cohort_terms = uniq_cohort_ontology_levels.values.flatten.length
+
+		ontology_levels = []
+		distribution_percentage = []
+		hpo_ontology_levels.each do |level, terms|
+			cohort_terms = cohort_ontology_levels[level]
+			uniq_cohort_terms = uniq_cohort_ontology_levels[level]
+			if cohort_terms.nil? || uniq_cohort_terms.nil?
+				num = 0
+				u_num = 0
+			else
+				num = cohort_terms.length
+				u_num = uniq_cohort_terms.length
+			end
+			ontology_levels << [level, terms.length, num]
+			distribution_percentage << [
+				level,
+				(terms.length.fdiv(total_ontology_terms)*100).round(3),
+				(num.fdiv(total_cohort_terms)*100).round(3),
+				(u_num.fdiv(total_uniq_cohort_terms)*100).round(3)
+			]
+		end
+		ontology_levels.sort! { |x,y| x.first <=> y.first }
+		distribution_percentage.sort! { |x,y| x.first <=> y.first }
+		return ontology_levels, distribution_percentage
+	end
+
+	def get_dataset_specifity_index(mode)
+		ontology_levels, distribution_percentage = get_profile_ontology_distribution_tables
+		if mode == 'uniq'
+			observed_distribution = 3
+		elsif mode == 'weigthed'
+			observed_distribution = 2
+		end
+		max_terms = distribution_percentage.map{|row| row[1]}.max
+		maxL = nil
+		distribution_percentage.each do |level_info|
+			maxL = level_info.first if level_info[1] == max_terms
+		end
+		diffL = distribution_percentage.map{|l| [l[0], l[observed_distribution] - l[1]]}
+		diffL.select!{|dL| dL.last > 0}
+		lowSection = diffL.select{|dL| dL.first <= maxL}
+		highSection = diffL.select{|dL| dL.first > maxL}
+		dsi = nil
+		if highSection.empty?
+			dsi = 0
+		else
+			accumulated_weigth = 0
+			accumulated_weigthed_diffL = 0
+			hss = get_weigthed_level_contribution(highSection, maxL, ontology_levels.length - maxL)
+			lss = get_weigthed_level_contribution(lowSection, maxL, maxL)
+			dsi = hss.fdiv(lss)
+		end
+		return dsi
+	end
+
+	def get_weigthed_level_contribution(section, maxL, nLevels)
+		accumulated_weigthed_diffL = 0
+		section.each do |level, diff|
+			weightL = maxL - level
+			if weightL >= 0
+				weightL += 1
+			else
+				weightL = weightL.abs
+			end
+			accumulated_weigthed_diffL += diff * weightL
+		end
+		weigthed_contribution = accumulated_weigthed_diffL.fdiv(nLevels)
+		return weigthed_contribution
+	end
+

  # Calculate profiles dictionary with Key= Term; Value = Profiles
  def calc_profiles_dictionary
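Read as a formula, get_dataset_specifity_index compares how far the profiles' level distribution sits above the ontology's own, split at the ontology's modal level maxL (this is a reading of the code above, not a formula stated in the source):

    DSI = \frac{\sum_{l > maxL} w_l d_l / (L - maxL)}{\sum_{l \le maxL} w_l d_l / maxL},
    \quad d_l = obs_l - ont_l > 0,
    \quad w_l = \begin{cases} maxL - l + 1 & l \le maxL \\ l - maxL & l > maxL \end{cases}

where obs_l and ont_l are the observed and ontology percentages of terms at level l, and L is the number of levels.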
@@ -1808,17 +1994,66 @@ class Ontology
      end
    end
    if expand
-
-
-
-
-
-
+      @items = self.concatItems(@items,relations)
+      # relations.each do |k,v| # MUST UPDATE THIS USING A CONCAT SPECIFIC FUNCTION
+      # 	if @items.keys.include?(k)
+      # 		if v.kind_of?(Array)
+      # 			@items[k] = (@items[k] + v).uniq
+      # 		elsif v.kind_of?(Hash)
+      # 			@items.merge!(relations) do |k, oldV, newV|
+      # 				if oldV.kind_of?(Array)
+      # 					return (oldV + newV).uniq
+      # 				else
+      # 					oldV = [oldV,newV]
+      # 				end
+      # 			end
+      # 		elsif @items[k].kind_of?(Array) # We suppose a single value/object from here
+      # 			@items[k] = (@items[k] + [v]).uniq
+      # 		else
+      # 			@items[k] = [@items[k],v]
+      # 		end
+      # 	else
+      # 		@items[k] = v
+      # 	end
+      # end
    else
      @items.merge!(relations)
    end
-	end
+	end
+
+	# Internal function to concat two elements.
+	# ===== Parameters
+	# +itemA+:: item to be concatenated
+	# +itemB+:: item to be concatenated
+	# ===== Returns
+	# Concatenated objects
+	def concatItems(itemA,itemB)
+		# A is Array :: RETURN ARRAY
+		# 	A_array : B_array
+		# 	A_array : B_hash => NOT ALLOWED
+		# 	A_array : B_single => NOT ALLOWED
+		# A is Hash :: RETURN HASH
+		# 	A_hash : B_array => NOT ALLOWED
+		# 	A_hash : B_hash
+		# 	A_hash : B_single => NOT ALLOWED
+		# A is single element => RETURN ARRAY
+		# 	A_single : B_array
+		# 	A_single : B_hash => NOT ALLOWED
+		# 	A_single : B_single
+		concatenated = nil
+		if itemA.kind_of?(Array) && itemB.kind_of?(Array)
+			concatenated = (itemA + itemB).uniq
+		elsif itemA.kind_of?(Hash) && itemB.kind_of?(Hash)
+			concatenated = itemA.merge(itemB) do |k, oldV, newV|
+				self.concatItems(oldV,newV)
+			end
+		elsif itemB.kind_of?(Array)
+			concatenated = ([itemA] + itemB).uniq
+		elsif ![Array, Hash].include?(itemB.class)
+			concatenated = [itemA,itemB].uniq
+		end
+		return concatenated
+	end

  # Assign a dictionary already calculated as a items set.
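A small illustration of the recursive merge concatItems performs on nested hashes (keys and values invented):

    a = { GO1: [:itemA], GO2: { sub: [:x] } }
    b = { GO1: [:itemB], GO2: { sub: [:y] }, GO3: [:itemC] }

    onto.concatItems(a, b)
    # => { GO1: [:itemA, :itemB], GO2: { sub: [:x, :y] }, GO3: [:itemC] }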
@@ -1826,7 +2061,7 @@ class Ontology
  # +dictID+:: dictionary ID to be stored (:byTerm will be used)
  def set_items_from_dict(dictID, remove_old_relations = false)
    @items = {} if remove_old_relations
-    if
+    if !@dicts[dictID].nil?
      @items.merge(@dicts[dictID][:byTerm])
    else
      warn('Specified ID is not calculated. Dict will not be added as a items set')
@@ -1875,7 +2110,7 @@ class Ontology
    curr_keys.map do |term_expand|
      to_infer = []
      # Obtain childs
-      childs = self.get_descendants(term_expand,true).select{|t|
+      childs = self.get_descendants(term_expand,true).select{|t| !@items[t].nil?}
      # Expand
      if childs.length > 0 && minimum_childs == 1 # Special case
        to_infer = childs.map{|c| @items[c]}.flatten.compact.uniq
@@ -1931,40 +2166,172 @@ class Ontology
  end


+	# Return direct ancestors/descendants of a given term
+	# ===== Parameters
+	# +term+:: which are requested
+	# +relation+:: can be :ancestor or :descendant
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct ancestors/descendants of given term or nil if any error occurs
+	def get_direct_related(term, relation, remove_alternatives: false)
+		if @dicts[:is_a].nil?
+			warn("Hierarchy dictionary is not already calculated. Returning nil")
+			return nil
+		end
+		target = nil
+		case relation
+		when :ancestor
+			target = :byTerm
+		when :descendant
+			target = :byValue
+		else
+			warn('Relation type not allowed. Returning nil')
+		end
+		return nil if target.nil?
+		query = @dicts[:is_a][target][term]
+		return query if query.nil?
+		query, _ = remove_alternatives_from_profile(query) if remove_alternatives
+		return query
+	end
+
+
+	# Return direct ancestors of a given term
+	# ===== Parameters
+	# +term+:: which ancestors are requested
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct ancestors of given term or nil if any error occurs
+	def get_direct_ancentors(term, remove_alternatives: false)
+		return self.get_direct_related(term, :ancestor, remove_alternatives: remove_alternatives)
+	end
+
+	# Return direct descendants of a given term
+	# ===== Parameters
+	# +term+:: which descendants are requested
+	# +remove_alternatives+:: if true, alternatives will be removed
+	# ===== Returns
+	# Direct descendants of given term or nil if any error occurs
+	def get_direct_descendants(term, remove_alternatives: false)
+		return self.get_direct_related(term, :descendant, remove_alternatives: remove_alternatives)
+	end
+
+
+
+	#============================================================================
+	#============================================================================

  # NO IDEA WHAT THIS DOES. DON'T USE THIS METHODS IS NOT CHECKED
  # ===== Parameters
  # ++::
  # ===== Returns
  # ...
-	def compute_relations_to_items(external_item_list, mode, thresold)
+	def compute_relations_to_items(external_item_list, total_items, mode, thresold)
+		terms_levels = list_terms_per_level_from_items
+		#puts terms_levels.inspect.yellow
+		connect_familiars!(terms_levels)
+		#puts terms_levels.inspect.blue
+		item_list_with_transf_parental = get_item_list_parental(terms_levels)
+		results = []
+		if mode == :elim
+			results = compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list_with_transf_parental)
+		elsif mode == :weight
+			results = compute_relations_weight(terms_levels, external_item_list, total_items, item_list_with_transf_parental)
+		end
+		return results
+	end
+
+	def get_item_list_parental(terms_levels)
+		transfered_list = {}
+		parent_dict = @dicts[:is_a][:byTerm]
+		levels = terms_levels.keys.sort
+		while levels.length > 1
+			level = levels.pop
+			terms_levels[level].each do |term|
+				parents = parent_dict[term]
+				if parents.nil?
+					next
+				elsif parents.length == 1
+					parent = parents.first
+				else
+					parent = (parents | terms_levels[level - 1]).first
+				end
+				term_it = @items[term]
+				parent_it = @items[parent]
+				curr_it = transfered_list[term]
+				parent_all_items = merge_groups([term_it, parent_it, curr_it])
+				transfered_list[parent] = parent_all_items if !parent_all_items.empty?
+				term_all_items = merge_groups([term_it, curr_it])
+				transfered_list[term] = term_all_items if !term_all_items.empty?
+			end
+		end
+		terms_levels[levels.first].each do |term| # Rescue lower level terms that not have children so they cannot receive items
+			transfered_list[term] = @items[term] if transfered_list[term].nil?
+		end
+		return transfered_list
+	end
+
+	def merge_groups(groups)
+		return groups.compact.inject([]){|it, a| it | a}
+	end
+
+	def list_terms_per_level_from_items
+		terms_levels = {}
+		@items.each do |term, items|
+			level = self.get_term_level(term)
+			query = terms_levels[level]
+			if query.nil?
+				terms_levels[level] = [term]
+			else
+				query << term
+			end
+		end
+		return terms_levels
+	end
+
+	def connect_familiars!(terms_levels)
+		levels = terms_levels.keys.sort
+		while levels.length > 1 # Process when current level has a parental level
+			level = levels.pop
+			parental_level = level - 1
+			parental_terms = terms_levels[parental_level]
+			if parental_terms.nil? # The queried parent level not exists but there is a parental level above of the non existant
+				parental_terms = [] # Initialize required parental level
+				terms_levels[parental_level] = parental_terms
+				levels << parental_level
+			end
+			terms_levels[level].each do |term|
+				path_info = @term_paths[term]
+				shortest_path_length = path_info[:shortest_path]
+				path = path_info[:paths].select{|p| p.length == shortest_path_length}.first
+				parental = path[1] # the first elements is the term itself
+				parental_terms << parental if !parental_terms.include?(parental)
+			end
+		end
+	end
+
+	def compute_relations_elim(terms_levels, external_item_list, total_items, thresold, item_list)
    results = []
    penalized_terms = {}
-    # terms_levels = get_terms_levels(@items_relations.keys)
-    terms_with_items_levels = @items_relations.keys.map{|term| self.get_term_level(term)}.uniq
-    terms_levels = self.get_ontology_levels().select{|k,v| terms_with_items_levels.include?(k)}
-    terms_levels = terms_levels.each{|level,terms| [level, terms.select{|t| @items_relations.keys.include?(t)}] } # Use only items terms. MAYBE IT'S NOT OUR TARGET (line added by fmj)
    levels = terms_levels.keys.sort
    levels.reverse_each do |level|
      terms_levels[level].each do |term|
-        associated_items =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        end
+        associated_items = item_list[term]
+        items_to_remove = penalized_terms[term]
+        items_to_remove = [] if items_to_remove.nil?
+        pval = get_fisher_exact_test(
+          external_item_list - items_to_remove,
+          associated_items - items_to_remove,
+          #((associated_items | external_item_list) - items_to_remove).length
+          total_items
+        )
+        if pval <= thresold
+          parents = get_ancestors(term) # Save the items for each parent term to remove them later in the fisher test
+          parents.each do |prnt|
+            query = penalized_terms[prnt]
+            if query.nil?
+              penalized_terms[prnt] = item_list[term].clone # We need a new array to store the following iterations
+            else
+              query.concat(item_list[term])
            end
          end
        end
|
|
1974
2341
|
return results
|
1975
2342
|
end
|
1976
2343
|
|
2344
|
+
def compute_relations_weight(terms_levels, external_item_list, total_items, item_list)
|
2345
|
+
pvals = {}
|
2346
|
+
item_weigths_per_term = Hash.new { |hash, key| Hash.new(1) } #https://mensfeld.pl/2016/09/ruby-hash-default-value-be-cautious-when-you-use-it/
|
2347
|
+
levels = terms_levels.keys.sort
|
2348
|
+
levels.reverse_each do |level|
|
2349
|
+
terms_levels[level].each do |term|
|
2350
|
+
associated_items = item_list[term]
|
2351
|
+
#initialize observed items in item_weigths_per_term list
|
2352
|
+
add_items_to_weigthed_list(term, associated_items, item_weigths_per_term) if !associated_items.nil?
|
2353
|
+
children = @dicts[:is_a][:byValue][term]
|
2354
|
+
if children.nil?
|
2355
|
+
children = []
|
2356
|
+
else
|
2357
|
+
children = children.select{|ch| item_weigths_per_term[ch].length > 0} # Only use children with items associated to them OR transfered to them
|
2358
|
+
end
|
2359
|
+
computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
|
2360
|
+
end
|
2361
|
+
end
|
2362
|
+
return pvals.to_a
|
2363
|
+
end
|
2364
|
+
|
2365
|
+
def add_items_to_weigthed_list(term, associated_items, weigthed_list)
|
2366
|
+
term_weigthing = weigthed_list[term]
|
2367
|
+
associated_items.each{|ai| term_weigthing[ai] = 1}
|
2368
|
+
weigthed_list[term] = term_weigthing
|
2369
|
+
end
|
2370
|
+
|
2371
|
+
def computeTermSig(term, children, external_item_list, total_items, pvals, item_weigths_per_term)
|
2372
|
+
#puts term.to_s.red
|
2373
|
+
#puts @term_paths[term].inspect
|
2374
|
+
#puts @dicts[:is_a][:byValue][term].inspect.light_blue
|
2375
|
+
associated_items = item_weigths_per_term[term].keys
|
2376
|
+
pval = get_fisher_exact_test(external_item_list, associated_items, total_items,
|
2377
|
+
'two_sided', item_weigths_per_term[term], true)
|
2378
|
+
pvals[term] = pval
|
2379
|
+
if children.length > 0
|
2380
|
+
rates = {}
|
2381
|
+
sig_child = 0
|
2382
|
+
children.each do |child|
|
2383
|
+
ratio = sigRatio(pvals[child], pval)
|
2384
|
+
rates[child] = ratio
|
2385
|
+
sig_child += 1 if ratio >= 1
|
2386
|
+
end
|
2387
|
+
if sig_child == 0 # CASE 1
|
2388
|
+
children.each do |child|
|
2389
|
+
current_ratio = rates[child]
|
2390
|
+
query_child = item_weigths_per_term[child]
|
2391
|
+
query_child.transform_values!{|weight| weight * current_ratio}
|
2392
|
+
pvals[child] = get_fisher_exact_test(external_item_list, item_weigths_per_term[child].keys, total_items,
|
2393
|
+
'two_sided', item_weigths_per_term[child], true)
|
2394
|
+
end
|
2395
|
+
else
|
2396
|
+
ancs = get_ancestors(term, filter_alternatives = true)
|
2397
|
+
ancs << term
|
2398
|
+
rates.each do |ch, ratio|# CASE 2
|
2399
|
+
if ratio >= 1 # The child is better than parent
|
2400
|
+
ancs.each do |anc|
|
2401
|
+
query_anc = item_weigths_per_term[anc]
|
2402
|
+
associated_items.each do |item|
|
2403
|
+
query_anc[item] /= ratio # /= --> query_anc[item]/ratio
|
2404
|
+
end
|
2405
|
+
end
|
2406
|
+
end
|
2407
|
+
end
|
2408
|
+
computeTermSig(term, children - rates.keys, external_item_list, total_items, pvals, item_weigths_per_term)
|
2409
|
+
end
|
2410
|
+
end
|
2411
|
+
end
|
2412
|
+
|
2413
|
+
def sigRatio(pvalA, pvalB)
|
2414
|
+
return Math.log(pvalA)/Math.log(pvalB)
|
2415
|
+
end
|
2416
|
+
|
2417
|
+
#============================================================================
|
2418
|
+
#============================================================================
|
1977
2419
|
|
1978
2420
|
# Check if a given ID is a removable (blacklist) term.
|
1979
2421
|
# +DEPRECATED+ use is_removable? instead
|
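sigRatio compares two p-values on a log scale. Since ln p is negative for p-values in (0, 1), the ratio is at least 1 exactly when the child's p-value is at most the parent's:

    \mathrm{sigRatio}(p_c, p_p) = \frac{\ln p_c}{\ln p_p} \ge 1 \iff p_c \le p_p \qquad (0 < p < 1)

so CASE 1 down-weights all children when none beats the parent, and CASE 2 down-weights the ancestors' items when some child does.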
data/lib/semtools/version.rb
CHANGED
data/semtools.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semtools
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.6
 platform: ruby
 authors:
 - seoanezonjic
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-05-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text
@@ -53,6 +53,20 @@ dependencies:
     - - ">="
     - !ruby/object:Gem::Version
       version: '0'
+- !ruby/object:Gem::Dependency
+  name: colorize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: 0.7.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: 0.7.3
 description: This gem allows to perform ontology based operations and calculation
   of Semantic similarity and information coefficient using different implementations.
 email: