text_alignment 0.3.22 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa961ae295a43e3912878dab34fac18a8cbca395eea999a781a538ce17c61bf3
4
- data.tar.gz: 38ae815d192104b9bff4664c5749910675ae4e5d9c895fa63c26fdc24f6d7b91
3
+ metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
+ data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
5
5
  SHA512:
6
- metadata.gz: 185bdcd0298932a09795723085734d79aaddf638a45c871ff1d29e6c89e02041ddd12f848e3350244b27ac050adc7c52f7e1dedfe6664159d781c25d6dd55f1b
7
- data.tar.gz: 0b88cd8aff0f529b5cad32a360aba476e90d525bc77f89c5c5c59dcef5a6d9077c849acdf9f289f61fea2a50ebe927fd9a53c1eb0bdca9c7661900afdaa8fb94
6
+ metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
+ data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
@@ -1,17 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
- TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
-
10
7
  class TextAlignment::AnchorFinder
11
8
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
9
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
10
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
11
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
+ @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
13
 
16
14
  @reverse = (target_str.length < source_str.length)
17
15
 
@@ -43,10 +41,10 @@ class TextAlignment::AnchorFinder
43
41
  break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
44
42
 
45
43
  left_window_s1, left_window_s2 = get_left_windows
46
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
44
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
47
45
 
48
46
  right_window_s1, right_window_s2 = get_right_windows
49
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
50
48
 
51
49
  search_position = @beg_s2 + 1
52
50
  end
@@ -87,7 +85,8 @@ class TextAlignment::AnchorFinder
87
85
  private
88
86
 
89
87
  def get_left_windows
90
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
88
+ # commented out below, on the assumption that the beginning of a document gives significant locational information
89
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
91
90
 
92
91
  window_s1 = ''
93
92
  loc = @beg_s1 - 1
@@ -115,7 +114,8 @@ class TextAlignment::AnchorFinder
115
114
  end
116
115
 
117
116
  def get_right_windows
118
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
+ # commented out below, on the assumption that the end of a document gives significant locational information
118
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
119
119
 
120
120
  window_s1 = ''
121
121
  loc = @beg_s1 + @size_ngram
@@ -1,13 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
7
  # approximate the location of str1 in str2
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
11
9
 
12
10
  class << TextAlignment
13
11
 
@@ -16,8 +14,8 @@ class << TextAlignment
16
14
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
15
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
18
16
 
19
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
17
+ ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
18
+ ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
21
19
  ngram_shared = ngram1 & ngram2
22
20
 
23
21
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
@@ -45,7 +43,7 @@ class << TextAlignment
45
43
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
44
  cache["#{fit_begin}-#{fit_end}"] = text_similarity
47
45
 
48
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
49
47
  fit_begin, fit_end = nil, nil
50
48
  end
51
49
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
@@ -0,0 +1,7 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
+ TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
12
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
13
-
14
12
  class TextAlignment::GLCSTextAlignment
15
13
  attr_reader :position_map_begin, :position_map_end
16
14
  attr_reader :common_elements, :mapped_elements
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
13
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
-
15
13
  class TextAlignment::MixedAlignment
16
14
  attr_reader :sdiff
17
15
  attr_reader :position_map_begin, :position_map_end
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
21
19
 
22
20
  def initialize(str1, str2, mappings = [])
23
21
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
- raise ArgumentError, "nil mappings" if mappings.nil?
25
-
26
- ## preprocessing
27
- str1 = str1.dup
28
- str2 = str2.dup
29
- mappings = mappings.dup
30
-
31
- ## find the first nomatch character
32
- TextAlignment::NOMATCH_CHARS.each_char do |c|
33
- if str2.index(c).nil?
34
- @nomatch_char1 = c
35
- break
36
- end
37
- end
38
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
-
40
- ## find the first nomatch character
41
- TextAlignment::NOMATCH_CHARS.each_char do |c|
42
- if c != @nomatch_char1 && str1.index(c).nil?
43
- @nomatch_char2 = c
44
- break
45
- end
46
- end
47
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
-
49
- # single character mappings
50
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
- characters_from = character_mappings.collect{|m| m[0]}.join
52
- characters_to = character_mappings.collect{|m| m[1]}.join
53
- characters_to.gsub!(/-/, '\-')
54
-
55
- str1.tr!(characters_from, characters_to)
56
- str2.tr!(characters_from, characters_to)
57
-
58
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
-
60
- # ASCII foldings
61
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
- ascii_foldings.each do |f|
63
- from = f[1]
64
-
65
- if str2.index(f[0])
66
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
- str1.gsub!(from, to)
68
- end
69
-
70
- if str1.index(f[0])
71
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
- str2.gsub!(from, to)
73
- end
74
- end
75
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
22
+ mappings ||= []
76
23
 
77
24
  _compute_mixed_alignment(str1, str2, mappings)
78
25
  end
@@ -1,23 +1,21 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'text_alignment/anchor_finder'
3
4
  require 'text_alignment/mixed_alignment'
4
5
 
5
6
  module TextAlignment; end unless defined? TextAlignment
6
7
 
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
-
8
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
11
9
 
12
10
  class TextAlignment::TextAlignment
13
11
  attr_reader :block_alignments
14
12
  attr_reader :similarity
15
13
  attr_reader :lost_annotations
16
14
 
17
- def initialize(str1, str2, mappings = nil)
18
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
+ def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
19
17
 
20
- mappings ||= TextAlignment::MAPPINGS
18
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
26
24
  return @block_alignments
27
25
  end
28
26
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
30
28
 
31
29
  # To collect matched blocks
32
30
  mblocks = []
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
232
230
  r
233
231
  end
234
232
 
233
+ private
234
+
235
+
236
+ def string_preprocessing(_str1, _str2)
237
+ str1 = _str1.dup
238
+ str2 = _str2.dup
239
+ mappings = TextAlignment::MAPPINGS.dup
240
+
241
+ ## single character mappings
242
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
243
+ characters_from = character_mappings.collect{|m| m[0]}.join
244
+ characters_to = character_mappings.collect{|m| m[1]}.join
245
+ characters_to.gsub!(/-/, '\-')
246
+
247
+ str1.tr!(characters_from, characters_to)
248
+ str2.tr!(characters_from, characters_to)
249
+
250
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
251
+
252
+ ## long to one character mappings
253
+ pletters = TextAlignment::PADDING_LETTERS
254
+
255
+ # find the padding letter for str1
256
+ padding_letter1 = begin
257
+ i = pletters.index{|l| str2.index(l).nil?}
258
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
+ TextAlignment::PADDING_LETTERS[i]
260
+ end
261
+
262
+ # find the padding letter for str2
263
+ padding_letter2 = begin
264
+ i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
265
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
+ TextAlignment::PADDING_LETTERS[i]
267
+ end
268
+
269
+ # ASCII foldings
270
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
271
+ ascii_foldings.each do |f|
272
+ from = f[1]
273
+
274
+ if str2.index(f[0])
275
+ to = f[0] + (padding_letter1 * (f[1].length - 1))
276
+ str1.gsub!(from, to)
277
+ end
278
+
279
+ if str1.index(f[0])
280
+ to = f[0] + (padding_letter2 * (f[1].length - 1))
281
+ str2.gsub!(from, to)
282
+ end
283
+ end
284
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
285
+
286
+ [str1, str2, mappings]
287
+ end
288
+
235
289
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.22'
2
+ VERSION = '0.4.3'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.22
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-28 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,6 +77,7 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/constants.rb
80
81
  - lib/text_alignment/find_divisions.rb
81
82
  - lib/text_alignment/glcs_alignment.rb
82
83
  - lib/text_alignment/glcs_alignment_fast.rb