text_alignment 0.3.21 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65e1d9b45ff59ac0a233b7656d2aca99d7e4e1051b1a03a0c7726521d4f2b280
4
- data.tar.gz: 710a3b68c5263f26572727e6e9591ebd5fdb095af4633bd5037c61eae0bb5cb6
3
+ metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
4
+ data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
5
5
  SHA512:
6
- metadata.gz: 598df22e41bbbe0a84b6e1a6a4e631ab0d8166810afd652086595feecbf0808a886685f42e5466626cbb1d6950dd9f1181be776b9938d6174dc7735c3ace24cd
7
- data.tar.gz: f7dedfb7e64919129f816fbba24dbd1c2e2a056c242a0865915b8a611f594399b17d051d004a846796bba1c2e89c6fb2f17116cd118ca6217cf1a5dff4f6d4d8
6
+ metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
7
+ data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -1,17 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
- TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
-
10
7
  class TextAlignment::AnchorFinder
11
8
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
9
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
10
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
11
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
+ @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
13
 
16
14
  @reverse = (target_str.length < source_str.length)
17
15
 
@@ -30,6 +28,10 @@ class TextAlignment::AnchorFinder
30
28
  def get_next_anchor
31
29
  # find the position of an anchor ngram in s1 and s2
32
30
  while @beg_s1 < (@s1.length - @size_ngram)
31
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
+ @beg_s1 += 1
33
+ next
34
+ end
33
35
  anchor = @s1[@beg_s1, @size_ngram]
34
36
 
35
37
  # search_position = 0
@@ -39,10 +41,10 @@ class TextAlignment::AnchorFinder
39
41
  break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
42
 
41
43
  left_window_s1, left_window_s2 = get_left_windows
42
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
44
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
43
45
 
44
46
  right_window_s1, right_window_s2 = get_right_windows
45
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
46
48
 
47
49
  search_position = @beg_s2 + 1
48
50
  end
@@ -57,9 +59,10 @@ class TextAlignment::AnchorFinder
57
59
  # extend the block
58
60
  b1 = @beg_s1
59
61
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
62
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
63
  b1 -= 1; b2 -= 1
62
64
  end
65
+
63
66
  b1 += 1; b2 += 1
64
67
 
65
68
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +85,8 @@ class TextAlignment::AnchorFinder
82
85
  private
83
86
 
84
87
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
88
+ # commented out the line below, on the assumption that the beginning of a document gives significant locational information
89
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
90
 
87
91
  window_s1 = ''
88
92
  loc = @beg_s1 - 1
@@ -110,7 +114,8 @@ class TextAlignment::AnchorFinder
110
114
  end
111
115
 
112
116
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
+ # commented out the line below, on the assumption that the end of a document gives significant locational information
118
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
119
 
115
120
  window_s1 = ''
116
121
  loc = @beg_s1 + @size_ngram
@@ -1,13 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
7
  # approximate the location of str1 in str2
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
11
9
 
12
10
  class << TextAlignment
13
11
 
@@ -16,8 +14,8 @@ class << TextAlignment
16
14
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
15
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
18
16
 
19
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
17
+ ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
18
+ ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
21
19
  ngram_shared = ngram1 & ngram2
22
20
 
23
21
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
@@ -45,7 +43,7 @@ class << TextAlignment
45
43
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
44
  cache["#{fit_begin}-#{fit_end}"] = text_similarity
47
45
 
48
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
49
47
  fit_begin, fit_end = nil, nil
50
48
  end
51
49
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
@@ -0,0 +1,7 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
+ TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
12
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
13
-
14
12
  class TextAlignment::GLCSTextAlignment
15
13
  attr_reader :position_map_begin, :position_map_end
16
14
  attr_reader :common_elements, :mapped_elements
@@ -1,22 +1,22 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'text_alignment/anchor_finder'
3
4
  require 'text_alignment/mixed_alignment'
4
5
 
5
6
  module TextAlignment; end unless defined? TextAlignment
6
7
 
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
-
11
-
12
8
  class TextAlignment::TextAlignment
13
9
  attr_reader :block_alignments
14
10
  attr_reader :similarity
15
11
  attr_reader :lost_annotations
16
12
 
17
- def initialize(str1, str2, mappings = nil)
13
+ def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
18
14
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
19
15
 
16
+ size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
17
+ size_window = _size_window || TextAlignment::SIZE_WINDOW
18
+ sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
19
+
20
20
  mappings ||= TextAlignment::MAPPINGS
21
21
 
22
22
  # try exact match
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
26
26
  return @block_alignments
27
27
  end
28
28
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
29
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
30
30
 
31
31
  # To collect matched blocks
32
32
  mblocks = []
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.21'
2
+ VERSION = '0.4.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.21
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-22 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,6 +77,7 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/constants.rb
80
81
  - lib/text_alignment/find_divisions.rb
81
82
  - lib/text_alignment/glcs_alignment.rb
82
83
  - lib/text_alignment/glcs_alignment_fast.rb