RubyGems - text_alignment - Versions diffs - 0.2.2 → 0.2.7 - Mend

text_alignment 0.2.2 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/Gemfile +2 -1
data/Gemfile.lock +3 -1
data/bin/align_annotations +52 -0
data/lib/text_alignment/approximate_fit.rb +31 -17
data/lib/text_alignment/lcs_cdiff.rb +1 -1
data/lib/text_alignment/text_alignment.rb +5 -46
data/lib/text_alignment/version.rb +1 -1
data/text_alignment.gemspec +1 -0
metadata +20 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3c42bc61965d0707fb1c8bd9815b363b04dc7d598e6c3bbbc7e318e8d7df37c6
-  data.tar.gz: a699745c6a92a5c980ca90add8d5e46128923fb774f204edb35514ed6b2df097
+  metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
+  data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
 SHA512:
-  metadata.gz: 7f03b5fa91cb4056844c8a02a48321fedc8a8e7f5a617f720b325c5c8bc40f43bc1e0ffebd25bce82777e1efce2421f0926a4a37dbcfe244b3792a7a19fe103f
-  data.tar.gz: cc5cd6a308fd99188b753ff28eb885e46aaf7b0dd51fa83046e2a12d9b9a4723eebdfa90be21ada755f7819362dc1974ff6f288b0799184880f19d235ae38ec6
+  metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
+  data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf

data/Gemfile CHANGED

@@ -1,8 +1,9 @@
 source 'https://rubygems.org'
-ruby '2.3.4'
+ruby '2.5.5'
 gem 'diff-lcs', '~> 1.3'
 gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
+gem 'string-similarity', '~> 2.1'
 group :test do
 	gem 'rspec', '~>3.0'

data/Gemfile.lock CHANGED

@@ -15,6 +15,7 @@ GEM
       rspec-support (~> 3.0.0)
     rspec-support (3.0.4)
     ruby-dictionary (1.1.1)
+    string-similarity (2.1.0)
 PLATFORMS
   ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
   diff-lcs (~> 1.3)
   rspec (~> 3.0)
   ruby-dictionary (~> 1.1, >= 1.1.1)
+  string-similarity (~> 2.1)
 RUBY VERSION
-   ruby 2.3.4p301
+   ruby 2.5.5p157
 BUNDLED WITH
    1.17.3

data/bin/align_annotations ADDED

@@ -0,0 +1,52 @@
+#!/usr/bin/env ruby
+require 'text_alignment'
+require 'json'
+require 'pp'
+unless ARGV.length == 2
+	warn "align_annotations target_annotations(.json) reference_annotations(.json)"
+	exit
+end
+anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
+anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
+str1 = anns1[:text]
+str2 = anns2[:text]
+denotations = anns1[:denotations]
+puts "[Alignment1]====="
+align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
+puts TextAlignment::sdiff2cdiff(align.sdiff)
+puts
+puts "[Similarity]\n#{align.similarity}"
+puts
+puts '[Denotations original]'
+pp denotations
+puts
+puts '[Denotations transformed]'
+new_denotations = align.transform_hdenotations(denotations)
+pp new_denotations
+puts
+puts "[Alignment2 (downcased)]====="
+align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
+puts TextAlignment::sdiff2cdiff(align.sdiff)
+puts
+puts "[Similarity]\n#{align.similarity}"
+puts
+puts '[Denotations original]'
+pp denotations
+puts
+puts '[Denotations transformed]'
+new_denotations = align.transform_hdenotations(denotations)
+pp new_denotations
+puts
+puts '[Annotations transformed]'
+anns2[:denotations] = new_denotations
+puts anns2.to_json
+# p align.common_elements
+# puts "---------------"
+# p align.mapped_elements

data/lib/text_alignment/approximate_fit.rb CHANGED

@@ -1,11 +1,14 @@
 #!/usr/bin/env ruby
+require 'string-similarity'
 module TextAlignment; end unless defined? TextAlignment
 # approximate the location of str1 in str2
 module TextAlignment
   SIGNATURE_NGRAM = 5
   MIN_LENGTH_FOR_APPROXIMATION = 50
-  BUFFER_RATE = 0.2
+  BUFFER_RATE = 0.1
+  TEXT_SIMILARITY_TRESHOLD = 0.8
 end
 class << TextAlignment
@@ -22,29 +25,40 @@ class << TextAlignment
     # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
     return nil, nil if ngram_shared.empty?
-    # approximate the beginning of the fit
-    signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
+    signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
+    return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
+    fit_begin, fit_end = nil, nil
+    signature_ngrams.each do |signature_ngram|
+      loc_signature_ngram_in_str1 = str1.index(signature_ngram)
+      loc_signature_ngram_in_str2 = str2.index(signature_ngram)
-    return nil, nil if signature_ngram.nil? #raise "no signature ngram"
-    offset = str1.index(signature_ngram)
-    fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
-    fit_begin = 0 if fit_begin < 0
+      # approximate the beginning of the fit
+      fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
+      fit_begin = 0 if fit_begin < 0
-    # to change the order according to ngram2
-    ngram_shared = ngram2 & ngram1
+      # approximate the end of the fit
+      offset_end = str1.length - loc_signature_ngram_in_str1
+      fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
+      fit_end = str2.length if fit_end > str2.length
-    # approximate the end of the fit
-    ngram_shared_reverse = ngram_shared.reverse
-    ngram2_reverse = ngram2.reverse
-    signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
-    return nil, nil if signature_ngram.nil? # raise "no signature ngram"
-    offset = str1.length - str1.rindex(signature_ngram)
-    fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
-    fit_end = str2.length if fit_end > str2.length
+      text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
+      break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+      fit_begin, fit_end = nil, nil
+    end
     return nil, nil if fit_begin >= fit_end
     return fit_begin, fit_end
   end
+  private
+  def text_similarity(str1, str2, ngram_order = 3)
+    _str1 = str1.delete(" \t\r\n")
+    _str2 = str2.delete(" \t\r\n")
+    String::Similarity.cosine(_str1, _str2, ngram:2)
+  end
 end
 if __FILE__ == $0

data/lib/text_alignment/lcs_cdiff.rb CHANGED

@@ -37,7 +37,7 @@ class << TextAlignment
       end
     end
-    cdiff_str1.gsub(/\n/, ' ') + "\n" + cdiff_str2.gsub(/\n/, ' ')
+    cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
   end
 end

data/lib/text_alignment/text_alignment.rb CHANGED

@@ -4,6 +4,7 @@ require 'text_alignment/lcs_min'
 require 'text_alignment/find_divisions'
 require 'text_alignment/lcs_comparison'
 require 'text_alignment/lcs_alignment'
+require 'text_alignment/lcs_cdiff'
 require 'text_alignment/glcs_alignment'
 require 'text_alignment/mappings'
@@ -26,6 +27,7 @@ class TextAlignment::TextAlignment
     ## preprocessing
     str1 = str1.dup
     str2 = str2.dup
+    mappings = mappings.dup
     ## find the first nomatch character
     TextAlignment::NOMATCH_CHARS.each_char do |c|
@@ -77,7 +79,7 @@ class TextAlignment::TextAlignment
   end
   def transform_a_span(span)
-    {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
+    {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
   end
   def transform_spans(spans)
@@ -89,11 +91,8 @@ class TextAlignment::TextAlignment
   end
   def transform_hdenotations(hdenotations)
-    unless hdenotations.nil?
-      hdenotations_new = Array.new(hdenotations)
-      (0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
-      hdenotations_new
-    end
+    return nil if hdenotations.nil?
+    hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
   end
   private
@@ -180,43 +179,3 @@ class TextAlignment::TextAlignment
     @position_map_end = posmap_end.sort.to_h
   end
 end
-if __FILE__ == $0
-  require 'json'
-  require 'text_alignment/lcs_cdiff'
-  str1 = "TI  - Identification of a region which directs the monocytic activity of the\n      colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n      promoter and binds PEBP2/CBF (AML1)."
-  str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. 
These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."
-  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
-  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true
-  if ARGV.length == 2
-    str1  = JSON.parse(File.read(ARGV[0]).strip)["text"]
-    denotations = JSON.parse(File.read(ARGV[0]).strip, symbolize_names:true)[:denotations]
-    str2  = JSON.parse(File.read(ARGV[1]).strip)["text"]
-    # str1 = File.read(ARGV[0])
-    # str2 = File.read(ARGV[1])
-  end
-  # dictionary = [["β", "beta"]]
-  # align = TextAlignment::TextAlignment.new(str1, str2)
-  align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
-  # p align.common_elements
-  # puts "---------------"
-  # p align.mapped_elements
-  puts TextAlignment::sdiff2cdiff(align.sdiff)
-  # p align
-  # puts "-----"
-  # p denotations
-  # puts "-----"
-  # new_denotations = align.transform_hdenotations(denotations)
-  # p new_denotations
-  # puts "-----"
-end

data/lib/text_alignment/version.rb CHANGED

@@ -1,3 +1,3 @@
 class TextAlignment
-  VERSION = '0.2.2'
+  VERSION = '0.2.7'
 end

data/text_alignment.gemspec CHANGED

@@ -18,5 +18,6 @@ Gem::Specification.new do |gem|
   gem.require_paths = ['lib']
   gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
+  gem.add_development_dependency 'string-similarity', '~> 2.1'
   gem.add_development_dependency 'rspec', '~>3.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.7
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-06-10 00:00:00.000000000 Z
+date: 2020-07-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary
@@ -30,6 +30,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 1.1.1
+- !ruby/object:Gem::Dependency
+  name: string-similarity
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.1'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -49,7 +63,8 @@ description: |-
                            of two character strings and annotations made to them.
 email:
 - jdkim@dbcls.rois.ac.jp
-executables: []
+executables:
+- align_annotations
 extensions: []
 extra_rdoc_files: []
 files:
@@ -58,6 +73,7 @@ files:
 - Gemfile.lock
 - LICENSE.txt
 - README.md
+- bin/align_annotations
 - lib/text_alignment.rb
 - lib/text_alignment/approximate_fit.rb
 - lib/text_alignment/find_divisions.rb
@@ -96,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Ruby class for aligning two character strings