text_alignment 0.2.2 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3c42bc61965d0707fb1c8bd9815b363b04dc7d598e6c3bbbc7e318e8d7df37c6
4
- data.tar.gz: a699745c6a92a5c980ca90add8d5e46128923fb774f204edb35514ed6b2df097
3
+ metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
4
+ data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
5
5
  SHA512:
6
- metadata.gz: 7f03b5fa91cb4056844c8a02a48321fedc8a8e7f5a617f720b325c5c8bc40f43bc1e0ffebd25bce82777e1efce2421f0926a4a37dbcfe244b3792a7a19fe103f
7
- data.tar.gz: cc5cd6a308fd99188b753ff28eb885e46aaf7b0dd51fa83046e2a12d9b9a4723eebdfa90be21ada755f7819362dc1974ff6f288b0799184880f19d235ae38ec6
6
+ metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
7
+ data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
data/Gemfile CHANGED
@@ -1,8 +1,9 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.3.4'
2
+ ruby '2.5.5'
3
3
 
4
4
  gem 'diff-lcs', '~> 1.3'
5
5
  gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
6
+ gem 'string-similarity', '~> 2.1'
6
7
 
7
8
  group :test do
8
9
  gem 'rspec', '~>3.0'
@@ -15,6 +15,7 @@ GEM
15
15
  rspec-support (~> 3.0.0)
16
16
  rspec-support (3.0.4)
17
17
  ruby-dictionary (1.1.1)
18
+ string-similarity (2.1.0)
18
19
 
19
20
  PLATFORMS
20
21
  ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
23
24
  diff-lcs (~> 1.3)
24
25
  rspec (~> 3.0)
25
26
  ruby-dictionary (~> 1.1, >= 1.1.1)
27
+ string-similarity (~> 2.1)
26
28
 
27
29
  RUBY VERSION
28
- ruby 2.3.4p301
30
+ ruby 2.5.5p157
29
31
 
30
32
  BUNDLED WITH
31
33
  1.17.3
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_alignment'
3
+
4
+ require 'json'
5
+ require 'pp'
6
+
7
+ unless ARGV.length == 2
8
+ warn "align_annotations target_annotations(.json) reference_annotations(.json)"
9
+ exit
10
+ end
11
+
12
+ anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
13
+ anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
14
+
15
+ str1 = anns1[:text]
16
+ str2 = anns2[:text]
17
+
18
+ denotations = anns1[:denotations]
19
+
20
+ puts "[Alignment1]====="
21
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
22
+ puts TextAlignment::sdiff2cdiff(align.sdiff)
23
+ puts
24
+ puts "[Similarity]\n#{align.similarity}"
25
+ puts
26
+ puts '[Denotations original]'
27
+ pp denotations
28
+ puts
29
+ puts '[Denotations transformed]'
30
+ new_denotations = align.transform_hdenotations(denotations)
31
+ pp new_denotations
32
+ puts
33
+ puts "[Alignment2 (downcased)]====="
34
+ align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
35
+ puts TextAlignment::sdiff2cdiff(align.sdiff)
36
+ puts
37
+ puts "[Similarity]\n#{align.similarity}"
38
+ puts
39
+ puts '[Denotations original]'
40
+ pp denotations
41
+ puts
42
+ puts '[Denotations transformed]'
43
+ new_denotations = align.transform_hdenotations(denotations)
44
+ pp new_denotations
45
+ puts
46
+ puts '[Annotations transformed]'
47
+ anns2[:denotations] = new_denotations
48
+ puts anns2.to_json
49
+
50
+ # p align.common_elements
51
+ # puts "---------------"
52
+ # p align.mapped_elements
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
2
4
  module TextAlignment; end unless defined? TextAlignment
3
5
 
4
6
  # approximate the location of str1 in str2
5
7
  module TextAlignment
6
8
  SIGNATURE_NGRAM = 5
7
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
8
- BUFFER_RATE = 0.2
10
+ BUFFER_RATE = 0.1
11
+ TEXT_SIMILARITY_TRESHOLD = 0.8
9
12
  end
10
13
 
11
14
  class << TextAlignment
@@ -22,29 +25,40 @@ class << TextAlignment
22
25
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
23
26
  return nil, nil if ngram_shared.empty?
24
27
 
25
- # approximate the beginning of the fit
26
- signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
28
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
+
31
+ fit_begin, fit_end = nil, nil
32
+ signature_ngrams.each do |signature_ngram|
33
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
34
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
27
35
 
28
- return nil, nil if signature_ngram.nil? #raise "no signature ngram"
29
- offset = str1.index(signature_ngram)
30
- fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
31
- fit_begin = 0 if fit_begin < 0
36
+ # approximate the beginning of the fit
37
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
38
+ fit_begin = 0 if fit_begin < 0
32
39
 
33
- # to change the order according to ngram2
34
- ngram_shared = ngram2 & ngram1
40
+ # approximate the end of the fit
41
+ offset_end = str1.length - loc_signature_ngram_in_str1
42
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
43
+ fit_end = str2.length if fit_end > str2.length
35
44
 
36
- # approximate the end of the fit
37
- ngram_shared_reverse = ngram_shared.reverse
38
- ngram2_reverse = ngram2.reverse
39
- signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
40
- return nil, nil if signature_ngram.nil? # raise "no signature ngram"
41
- offset = str1.length - str1.rindex(signature_ngram)
42
- fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
43
- fit_end = str2.length if fit_end > str2.length
45
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
+ fit_begin, fit_end = nil, nil
48
+ end
44
49
 
45
50
  return nil, nil if fit_begin >= fit_end
46
51
  return fit_begin, fit_end
47
52
  end
53
+
54
+ private
55
+
56
+ def text_similarity(str1, str2, ngram_order = 3)
57
+ _str1 = str1.delete(" \t\r\n")
58
+ _str2 = str2.delete(" \t\r\n")
59
+ String::Similarity.cosine(_str1, _str2, ngram:2)
60
+ end
61
+
48
62
  end
49
63
 
50
64
  if __FILE__ == $0
@@ -37,7 +37,7 @@ class << TextAlignment
37
37
  end
38
38
  end
39
39
 
40
- cdiff_str1.gsub(/\n/, ' ') + "\n" + cdiff_str2.gsub(/\n/, ' ')
40
+ cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
41
41
  end
42
42
 
43
43
  end
@@ -4,6 +4,7 @@ require 'text_alignment/lcs_min'
4
4
  require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/lcs_cdiff'
7
8
  require 'text_alignment/glcs_alignment'
8
9
  require 'text_alignment/mappings'
9
10
 
@@ -26,6 +27,7 @@ class TextAlignment::TextAlignment
26
27
  ## preprocessing
27
28
  str1 = str1.dup
28
29
  str2 = str2.dup
30
+ mappings = mappings.dup
29
31
 
30
32
  ## find the first nomatch character
31
33
  TextAlignment::NOMATCH_CHARS.each_char do |c|
@@ -77,7 +79,7 @@ class TextAlignment::TextAlignment
77
79
  end
78
80
 
79
81
  def transform_a_span(span)
80
- {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
82
+ {begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
81
83
  end
82
84
 
83
85
  def transform_spans(spans)
@@ -89,11 +91,8 @@ class TextAlignment::TextAlignment
89
91
  end
90
92
 
91
93
  def transform_hdenotations(hdenotations)
92
- unless hdenotations.nil?
93
- hdenotations_new = Array.new(hdenotations)
94
- (0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
95
- hdenotations_new
96
- end
94
+ return nil if hdenotations.nil?
95
+ hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
97
96
  end
98
97
 
99
98
  private
@@ -180,43 +179,3 @@ class TextAlignment::TextAlignment
180
179
  @position_map_end = posmap_end.sort.to_h
181
180
  end
182
181
  end
183
-
184
- if __FILE__ == $0
185
- require 'json'
186
- require 'text_alignment/lcs_cdiff'
187
-
188
- str1 = "TI - Identification of a region which directs the monocytic activity of the\n colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n promoter and binds PEBP2/CBF (AML1)."
189
- str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."
190
-
191
- # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
192
- # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true
193
-
194
- if ARGV.length == 2
195
- str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
196
- denotations = JSON.parse(File.read(ARGV[0]).strip, symbolize_names:true)[:denotations]
197
- str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
198
- # str1 = File.read(ARGV[0])
199
- # str2 = File.read(ARGV[1])
200
- end
201
-
202
- # dictionary = [["β", "beta"]]
203
- # align = TextAlignment::TextAlignment.new(str1, str2)
204
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
205
-
206
- # p align.common_elements
207
- # puts "---------------"
208
- # p align.mapped_elements
209
-
210
- puts TextAlignment::sdiff2cdiff(align.sdiff)
211
- # p align
212
- # puts "-----"
213
-
214
- # p denotations
215
- # puts "-----"
216
-
217
- # new_denotations = align.transform_hdenotations(denotations)
218
-
219
- # p new_denotations
220
- # puts "-----"
221
-
222
- end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.2'
2
+ VERSION = '0.2.7'
3
3
  end
@@ -18,5 +18,6 @@ Gem::Specification.new do |gem|
18
18
  gem.require_paths = ['lib']
19
19
 
20
20
  gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
21
+ gem.add_development_dependency 'string-similarity', '~> 2.1'
21
22
  gem.add_development_dependency 'rspec', '~>3.0'
22
23
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-10 00:00:00.000000000 Z
11
+ date: 2020-07-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -30,6 +30,20 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: string-similarity
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.1'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
33
47
  - !ruby/object:Gem::Dependency
34
48
  name: rspec
35
49
  requirement: !ruby/object:Gem::Requirement
@@ -49,7 +63,8 @@ description: |-
49
63
  of two character strings and annotations made to them.
50
64
  email:
51
65
  - jdkim@dbcls.rois.ac.jp
52
- executables: []
66
+ executables:
67
+ - align_annotations
53
68
  extensions: []
54
69
  extra_rdoc_files: []
55
70
  files:
@@ -58,6 +73,7 @@ files:
58
73
  - Gemfile.lock
59
74
  - LICENSE.txt
60
75
  - README.md
76
+ - bin/align_annotations
61
77
  - lib/text_alignment.rb
62
78
  - lib/text_alignment/approximate_fit.rb
63
79
  - lib/text_alignment/find_divisions.rb
@@ -96,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
96
112
  - !ruby/object:Gem::Version
97
113
  version: '0'
98
114
  requirements: []
99
- rubygems_version: 3.0.3
115
+ rubygems_version: 3.0.8
100
116
  signing_key:
101
117
  specification_version: 4
102
118
  summary: Ruby class for aligning two character strings