text_alignment 0.3.21 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65e1d9b45ff59ac0a233b7656d2aca99d7e4e1051b1a03a0c7726521d4f2b280
4
- data.tar.gz: 710a3b68c5263f26572727e6e9591ebd5fdb095af4633bd5037c61eae0bb5cb6
3
+ metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
4
+ data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
5
5
  SHA512:
6
- metadata.gz: 598df22e41bbbe0a84b6e1a6a4e631ab0d8166810afd652086595feecbf0808a886685f42e5466626cbb1d6950dd9f1181be776b9938d6174dc7735c3ace24cd
7
- data.tar.gz: f7dedfb7e64919129f816fbba24dbd1c2e2a056c242a0865915b8a611f594399b17d051d004a846796bba1c2e89c6fb2f17116cd118ca6217cf1a5dff4f6d4d8
6
+ metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
7
+ data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
@@ -108,7 +108,7 @@ else
108
108
  # verification
109
109
  source_text = source_annotations[:text]
110
110
  puts "=====BEGIN"
111
- (0 ... source_text.length).each do |p|
111
+ (0 ... source_text.rstrip.length).each do |p|
112
112
  t = alignment.transform_begin_position(p)
113
113
  if t.nil?
114
114
  print source_text[p]
@@ -120,7 +120,7 @@ else
120
120
  puts "=====END"
121
121
 
122
122
  puts "=====BEGIN"
123
- (0 .. source_text.length).each do |p|
123
+ (0 .. source_text.rstrip.length).each do |p|
124
124
  t = alignment.transform_end_position(p)
125
125
  if t.nil?
126
126
  print source_text[p]
@@ -1,17 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
- TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
-
10
7
  class TextAlignment::AnchorFinder
11
8
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
9
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
10
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
11
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
+ @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
13
 
16
14
  @reverse = (target_str.length < source_str.length)
17
15
 
@@ -30,6 +28,10 @@ class TextAlignment::AnchorFinder
30
28
  def get_next_anchor
31
29
  # find the position of an anchor ngram in s1 and s2
32
30
  while @beg_s1 < (@s1.length - @size_ngram)
31
+ if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
+ @beg_s1 += 1
33
+ next
34
+ end
33
35
  anchor = @s1[@beg_s1, @size_ngram]
34
36
 
35
37
  # search_position = 0
@@ -39,10 +41,10 @@ class TextAlignment::AnchorFinder
39
41
  break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
40
42
 
41
43
  left_window_s1, left_window_s2 = get_left_windows
42
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
44
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
43
45
 
44
46
  right_window_s1, right_window_s2 = get_right_windows
45
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
46
48
 
47
49
  search_position = @beg_s2 + 1
48
50
  end
@@ -57,9 +59,10 @@ class TextAlignment::AnchorFinder
57
59
  # extend the block
58
60
  b1 = @beg_s1
59
61
  b2 = @beg_s2
60
- while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
62
+ while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
61
63
  b1 -= 1; b2 -= 1
62
64
  end
65
+
63
66
  b1 += 1; b2 += 1
64
67
 
65
68
  e1 = @beg_s1 + @size_ngram
@@ -82,7 +85,8 @@ class TextAlignment::AnchorFinder
82
85
  private
83
86
 
84
87
  def get_left_windows
85
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
88
+ # commend below with the assumption that the beginning of a document gives a significant locational information
89
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
86
90
 
87
91
  window_s1 = ''
88
92
  loc = @beg_s1 - 1
@@ -110,7 +114,8 @@ class TextAlignment::AnchorFinder
110
114
  end
111
115
 
112
116
  def get_right_windows
113
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
+ # commend below with the assumption that the end of a document gives a significant locational
118
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
114
119
 
115
120
  window_s1 = ''
116
121
  loc = @beg_s1 + @size_ngram
@@ -1,13 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
7
  # approximate the location of str1 in str2
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
11
9
 
12
10
  class << TextAlignment
13
11
 
@@ -16,8 +14,8 @@ class << TextAlignment
16
14
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
15
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
18
16
 
19
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
17
+ ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
18
+ ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
21
19
  ngram_shared = ngram1 & ngram2
22
20
 
23
21
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
@@ -45,7 +43,7 @@ class << TextAlignment
45
43
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
44
  cache["#{fit_begin}-#{fit_end}"] = text_similarity
47
45
 
48
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
49
47
  fit_begin, fit_end = nil, nil
50
48
  end
51
49
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
@@ -0,0 +1,7 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
+ TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
12
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
13
-
14
12
  class TextAlignment::GLCSTextAlignment
15
13
  attr_reader :position_map_begin, :position_map_end
16
14
  attr_reader :common_elements, :mapped_elements
@@ -1,22 +1,22 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'text_alignment/anchor_finder'
3
4
  require 'text_alignment/mixed_alignment'
4
5
 
5
6
  module TextAlignment; end unless defined? TextAlignment
6
7
 
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
-
11
-
12
8
  class TextAlignment::TextAlignment
13
9
  attr_reader :block_alignments
14
10
  attr_reader :similarity
15
11
  attr_reader :lost_annotations
16
12
 
17
- def initialize(str1, str2, mappings = nil)
13
+ def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
18
14
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
19
15
 
16
+ size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
17
+ size_window = _size_window || TextAlignment::SIZE_WINDOW
18
+ sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
19
+
20
20
  mappings ||= TextAlignment::MAPPINGS
21
21
 
22
22
  # try exact match
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
26
26
  return @block_alignments
27
27
  end
28
28
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
29
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
30
30
 
31
31
  # To collect matched blocks
32
32
  mblocks = []
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.21'
2
+ VERSION = '0.4.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.21
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-22 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,6 +77,7 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/constants.rb
80
81
  - lib/text_alignment/find_divisions.rb
81
82
  - lib/text_alignment/glcs_alignment.rb
82
83
  - lib/text_alignment/glcs_alignment_fast.rb