text_alignment 0.2.6 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e9a7bcce440fe7a86655b4c8edceb92607c810a4cdf6d05baee3ffe55199db6
4
- data.tar.gz: 739bed915b416c59c908c396f312f6b04df79cbad4075cbcbfab3a82db610fd5
3
+ metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
4
+ data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
5
5
  SHA512:
6
- metadata.gz: f2fe7ba9a6bee8150572f7813f9ce680c8fd3fe1252292f351082d95293392969d4094e492e04f844c0bc956cbb8934213937e11b75789f4b6432f31c1e7fad0
7
- data.tar.gz: aed5b749142693d86f26c8bdd0d68ba89727f5b8dbab916f0f476d56d253ccf63a0ea908230192d4d5e1511124b515c013fde90ffa6873a28b3124702c926339
6
+ metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
7
+ data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
data/Gemfile CHANGED
@@ -1,8 +1,9 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.3.4'
2
+ ruby '2.5.5'
3
3
 
4
4
  gem 'diff-lcs', '~> 1.3'
5
5
  gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
6
+ gem 'string-similarity', '~> 2.1'
6
7
 
7
8
  group :test do
8
9
  gem 'rspec', '~>3.0'
@@ -15,6 +15,7 @@ GEM
15
15
  rspec-support (~> 3.0.0)
16
16
  rspec-support (3.0.4)
17
17
  ruby-dictionary (1.1.1)
18
+ string-similarity (2.1.0)
18
19
 
19
20
  PLATFORMS
20
21
  ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
23
24
  diff-lcs (~> 1.3)
24
25
  rspec (~> 3.0)
25
26
  ruby-dictionary (~> 1.1, >= 1.1.1)
27
+ string-similarity (~> 2.1)
26
28
 
27
29
  RUBY VERSION
28
- ruby 2.3.4p301
30
+ ruby 2.5.5p157
29
31
 
30
32
  BUNDLED WITH
31
33
  1.17.3
@@ -42,6 +42,10 @@ puts
42
42
  puts '[Denotations transformed]'
43
43
  new_denotations = align.transform_hdenotations(denotations)
44
44
  pp new_denotations
45
+ puts
46
+ puts '[Annotations transformed]'
47
+ anns2[:denotations] = new_denotations
48
+ puts anns2.to_json
45
49
 
46
50
  # p align.common_elements
47
51
  # puts "---------------"
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
2
4
  module TextAlignment; end unless defined? TextAlignment
3
5
 
4
6
  # approximate the location of str1 in str2
5
7
  module TextAlignment
6
8
  SIGNATURE_NGRAM = 5
7
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
8
- BUFFER_RATE = 0.2
10
+ BUFFER_RATE = 0.1
11
+ TEXT_SIMILARITY_TRESHOLD = 0.8
9
12
  end
10
13
 
11
14
  class << TextAlignment
@@ -22,29 +25,40 @@ class << TextAlignment
22
25
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
23
26
  return nil, nil if ngram_shared.empty?
24
27
 
25
- # approximate the beginning of the fit
26
- signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
28
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
+
31
+ fit_begin, fit_end = nil, nil
32
+ signature_ngrams.each do |signature_ngram|
33
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
34
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
27
35
 
28
- return nil, nil if signature_ngram.nil? #raise "no signature ngram"
29
- offset = str1.index(signature_ngram)
30
- fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
31
- fit_begin = 0 if fit_begin < 0
36
+ # approximate the beginning of the fit
37
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
38
+ fit_begin = 0 if fit_begin < 0
32
39
 
33
- # to change the order according to ngram2
34
- ngram_shared = ngram2 & ngram1
40
+ # approximate the end of the fit
41
+ offset_end = str1.length - loc_signature_ngram_in_str1
42
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
43
+ fit_end = str2.length if fit_end > str2.length
35
44
 
36
- # approximate the end of the fit
37
- ngram_shared_reverse = ngram_shared.reverse
38
- ngram2_reverse = ngram2.reverse
39
- signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
40
- return nil, nil if signature_ngram.nil? # raise "no signature ngram"
41
- offset = str1.length - str1.rindex(signature_ngram)
42
- fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
43
- fit_end = str2.length if fit_end > str2.length
45
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
+ fit_begin, fit_end = nil, nil
48
+ end
44
49
 
45
50
  return nil, nil if fit_begin >= fit_end
46
51
  return fit_begin, fit_end
47
52
  end
53
+
54
+ private
55
+
56
+ def text_similarity(str1, str2, ngram_order = 3)
57
+ _str1 = str1.delete(" \t\r\n")
58
+ _str2 = str2.delete(" \t\r\n")
59
+ String::Similarity.cosine(_str1, _str2, ngram:2)
60
+ end
61
+
48
62
  end
49
63
 
50
64
  if __FILE__ == $0
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.6'
2
+ VERSION = '0.2.7'
3
3
  end
@@ -18,5 +18,6 @@ Gem::Specification.new do |gem|
18
18
  gem.require_paths = ['lib']
19
19
 
20
20
  gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
21
+ gem.add_development_dependency 'string-similarity', '~> 2.1'
21
22
  gem.add_development_dependency 'rspec', '~>3.0'
22
23
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-12 00:00:00.000000000 Z
11
+ date: 2020-07-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -30,6 +30,20 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: string-similarity
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.1'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
33
47
  - !ruby/object:Gem::Dependency
34
48
  name: rspec
35
49
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  - !ruby/object:Gem::Version
99
113
  version: '0'
100
114
  requirements: []
101
- rubygems_version: 3.0.3
115
+ rubygems_version: 3.0.8
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Ruby class for aligning two character strings