text_alignment 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e9a7bcce440fe7a86655b4c8edceb92607c810a4cdf6d05baee3ffe55199db6
4
- data.tar.gz: 739bed915b416c59c908c396f312f6b04df79cbad4075cbcbfab3a82db610fd5
3
+ metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
4
+ data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
5
5
  SHA512:
6
- metadata.gz: f2fe7ba9a6bee8150572f7813f9ce680c8fd3fe1252292f351082d95293392969d4094e492e04f844c0bc956cbb8934213937e11b75789f4b6432f31c1e7fad0
7
- data.tar.gz: aed5b749142693d86f26c8bdd0d68ba89727f5b8dbab916f0f476d56d253ccf63a0ea908230192d4d5e1511124b515c013fde90ffa6873a28b3124702c926339
6
+ metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
7
+ data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
data/Gemfile CHANGED
@@ -1,8 +1,9 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.3.4'
2
+ ruby '2.5.5'
3
3
 
4
4
  gem 'diff-lcs', '~> 1.3'
5
5
  gem 'ruby-dictionary', '~>1.1', '>=1.1.1'
6
+ gem 'string-similarity', '~> 2.1'
6
7
 
7
8
  group :test do
8
9
  gem 'rspec', '~>3.0'
@@ -15,6 +15,7 @@ GEM
15
15
  rspec-support (~> 3.0.0)
16
16
  rspec-support (3.0.4)
17
17
  ruby-dictionary (1.1.1)
18
+ string-similarity (2.1.0)
18
19
 
19
20
  PLATFORMS
20
21
  ruby
@@ -23,9 +24,10 @@ DEPENDENCIES
23
24
  diff-lcs (~> 1.3)
24
25
  rspec (~> 3.0)
25
26
  ruby-dictionary (~> 1.1, >= 1.1.1)
27
+ string-similarity (~> 2.1)
26
28
 
27
29
  RUBY VERSION
28
- ruby 2.3.4p301
30
+ ruby 2.5.5p157
29
31
 
30
32
  BUNDLED WITH
31
33
  1.17.3
@@ -42,6 +42,10 @@ puts
42
42
  puts '[Denotations transformed]'
43
43
  new_denotations = align.transform_hdenotations(denotations)
44
44
  pp new_denotations
45
+ puts
46
+ puts '[Annotations transformed]'
47
+ anns2[:denotations] = new_denotations
48
+ puts anns2.to_json
45
49
 
46
50
  # p align.common_elements
47
51
  # puts "---------------"
@@ -1,11 +1,14 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'string-similarity'
3
+
2
4
  module TextAlignment; end unless defined? TextAlignment
3
5
 
4
6
  # approximate the location of str1 in str2
5
7
  module TextAlignment
6
8
  SIGNATURE_NGRAM = 5
7
9
  MIN_LENGTH_FOR_APPROXIMATION = 50
8
- BUFFER_RATE = 0.2
10
+ BUFFER_RATE = 0.1
11
+ TEXT_SIMILARITY_TRESHOLD = 0.8
9
12
  end
10
13
 
11
14
  class << TextAlignment
@@ -22,29 +25,40 @@ class << TextAlignment
22
25
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
23
26
  return nil, nil if ngram_shared.empty?
24
27
 
25
- # approximate the beginning of the fit
26
- signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}
28
+ signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
29
+ return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
30
+
31
+ fit_begin, fit_end = nil, nil
32
+ signature_ngrams.each do |signature_ngram|
33
+ loc_signature_ngram_in_str1 = str1.index(signature_ngram)
34
+ loc_signature_ngram_in_str2 = str2.index(signature_ngram)
27
35
 
28
- return nil, nil if signature_ngram.nil? #raise "no signature ngram"
29
- offset = str1.index(signature_ngram)
30
- fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
31
- fit_begin = 0 if fit_begin < 0
36
+ # approximate the beginning of the fit
37
+ fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
38
+ fit_begin = 0 if fit_begin < 0
32
39
 
33
- # to change the order according to ngram2
34
- ngram_shared = ngram2 & ngram1
40
+ # approximate the end of the fit
41
+ offset_end = str1.length - loc_signature_ngram_in_str1
42
+ fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
43
+ fit_end = str2.length if fit_end > str2.length
35
44
 
36
- # approximate the end of the fit
37
- ngram_shared_reverse = ngram_shared.reverse
38
- ngram2_reverse = ngram2.reverse
39
- signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
40
- return nil, nil if signature_ngram.nil? # raise "no signature ngram"
41
- offset = str1.length - str1.rindex(signature_ngram)
42
- fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
43
- fit_end = str2.length if fit_end > str2.length
45
+ text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
47
+ fit_begin, fit_end = nil, nil
48
+ end
44
49
 
45
50
  return nil, nil if fit_begin >= fit_end
46
51
  return fit_begin, fit_end
47
52
  end
53
+
54
+ private
55
+
56
+ def text_similarity(str1, str2, ngram_order = 3)
57
+ _str1 = str1.delete(" \t\r\n")
58
+ _str2 = str2.delete(" \t\r\n")
59
+ String::Similarity.cosine(_str1, _str2, ngram:2)
60
+ end
61
+
48
62
  end
49
63
 
50
64
  if __FILE__ == $0
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.2.6'
2
+ VERSION = '0.2.7'
3
3
  end
@@ -18,5 +18,6 @@ Gem::Specification.new do |gem|
18
18
  gem.require_paths = ['lib']
19
19
 
20
20
  gem.add_development_dependency 'ruby-dictionary', '~>1.1', '>=1.1.1'
21
+ gem.add_development_dependency 'string-similarity', '~> 2.1'
21
22
  gem.add_development_dependency 'rspec', '~>3.0'
22
23
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-12 00:00:00.000000000 Z
11
+ date: 2020-07-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -30,6 +30,20 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: string-similarity
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.1'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.1'
33
47
  - !ruby/object:Gem::Dependency
34
48
  name: rspec
35
49
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  - !ruby/object:Gem::Version
99
113
  version: '0'
100
114
  requirements: []
101
- rubygems_version: 3.0.3
115
+ rubygems_version: 3.0.8
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Ruby class for aligning two character strings